diff --git a/.bazelrc b/.bazelrc
index fc2995dc838c5..3656a86eb364c 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -2,7 +2,11 @@ build --cxxopt=--std=c++17
 build --copt=-I.
 # Bazel does not support including its cc_library targets as system
 # headers. We work around this for generated code
+<<<<<<< HEAD
 # (e.g. torch/headeronly/macros/cmake_macros.h) by making the generated directory a
+=======
+# (e.g. c10/macros/cmake_macros.h) by making the generated directory a
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 # system include path.
 build --copt=-isystem --copt bazel-out/k8-fastbuild/bin
 build --copt=-isystem --copt bazel-out/darwin-fastbuild/bin
diff --git a/.ci/aarch64_linux/aarch64_ci_build.sh b/.ci/aarch64_linux/aarch64_ci_build.sh
index b25f3b21e8eb1..9a178300266b7 100644
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ b/.ci/aarch64_linux/aarch64_ci_build.sh
@@ -3,6 +3,7 @@ set -eux -o pipefail
 GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
+<<<<<<< HEAD
 # Set CUDA architecture lists to match x86 build_cuda.sh
 if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
 export TORCH_CUDA_ARCH_LIST="8.0;9.0"
@@ -19,6 +20,10 @@ if [[ "$DESIRED_CUDA" == *"13"* ]]; then
 export TORCH_NVCC_FLAGS="-compress-mode=size"
 # Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
 export BUILD_BUNDLE_PTXAS=1
+=======
+if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
+ export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 fi
 SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
@@ -32,6 +37,7 @@ cd /
 # on the mounted pytorch repo
 git config --global --add safe.directory /pytorch
 pip install -r /pytorch/requirements.txt
+<<<<<<< HEAD
 pip install auditwheel==6.2.0 wheel
 if [ "$DESIRED_CUDA" = "cpu" ]; then
 echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
@@ -50,4 +56,16 @@ else
 fi
 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
+=======
+pip install auditwheel==6.2.0
+if [ "$DESIRED_CUDA" = "cpu" ]; then
+ echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
+ #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files + USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn +else + echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA" + export USE_SYSTEM_NCCL=1 + #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files + USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi diff --git a/.ci/aarch64_linux/aarch64_wheel_ci_build.py b/.ci/aarch64_linux/aarch64_wheel_ci_build.py index a99e5f8f65659..fe0a76c275c5c 100755 --- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py +++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py @@ -13,6 +13,52 @@ def list_dir(path: str) -> list[str]: return check_output(["ls", "-1", path]).decode().split("\n") +<<<<<<< HEAD +======= +def build_ArmComputeLibrary() -> None: + """ + Using ArmComputeLibrary for aarch64 PyTorch + """ + print("Building Arm Compute Library") + acl_build_flags = [ + "debug=0", + "neon=1", + "opencl=0", + "os=linux", + "openmp=1", + "cppthreads=0", + "arch=armv8a", + "multi_isa=1", + "fixed_format_kernels=1", + "build=native", + ] + acl_install_dir = "/acl" + acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary") + if os.path.isdir(acl_install_dir): + shutil.rmtree(acl_install_dir) + if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)): + check_call( + [ + "git", + "clone", + "https://github.com/ARM-software/ComputeLibrary.git", + "-b", + "v25.02", + "--depth", + "1", + "--shallow-submodules", + ] + ) + + check_call( + ["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags, + cwd=acl_checkout_dir, + ) + for d in ["arm_compute", "include", "utils", "support", "src", "build"]: + shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}") + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def replace_tag(filename) -> None: with open(filename) as f: lines = f.readlines() @@ -26,6 +72,7 @@ def replace_tag(filename) -> None: f.writelines(lines) +<<<<<<< HEAD def patch_library_rpath( folder: str, lib_name: str, @@ -88,11 +135,14 @@ def copy_and_patch_library( patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def package_cuda_wheel(wheel_path, desired_cuda) -> None: """ Package the cuda wheel libraries """ folder = os.path.dirname(wheel_path) +<<<<<<< HEAD os.mkdir(f"{folder}/tmp") os.system(f"unzip {wheel_path} -d {folder}/tmp") # Delete original wheel since it will be repackaged @@ -206,6 +256,57 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: # Copy libraries to unzipped_folder/torch/lib for lib_path in libs_to_copy: copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda) +======= + wheelname = os.path.basename(wheel_path) + os.mkdir(f"{folder}/tmp") + os.system(f"unzip {wheel_path} -d {folder}/tmp") + libs_to_copy = [ + "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12", + "/usr/local/cuda/lib64/libcudnn.so.9", + "/usr/local/cuda/lib64/libcublas.so.12", + "/usr/local/cuda/lib64/libcublasLt.so.12", + 
"/usr/local/cuda/lib64/libcudart.so.12", + "/usr/local/cuda/lib64/libcufft.so.11", + "/usr/local/cuda/lib64/libcusparse.so.12", + "/usr/local/cuda/lib64/libcusparseLt.so.0", + "/usr/local/cuda/lib64/libcusolver.so.11", + "/usr/local/cuda/lib64/libcurand.so.10", + "/usr/local/cuda/lib64/libnccl.so.2", + "/usr/local/cuda/lib64/libnvJitLink.so.12", + "/usr/local/cuda/lib64/libnvrtc.so.12", + "/usr/local/cuda/lib64/libcudnn_adv.so.9", + "/usr/local/cuda/lib64/libcudnn_cnn.so.9", + "/usr/local/cuda/lib64/libcudnn_graph.so.9", + "/usr/local/cuda/lib64/libcudnn_ops.so.9", + "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9", + "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9", + "/usr/local/cuda/lib64/libcudnn_heuristic.so.9", + "/lib64/libgomp.so.1", + "/usr/lib64/libgfortran.so.5", + "/acl/build/libarm_compute.so", + "/acl/build/libarm_compute_graph.so", + "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_lapack_core.so.0", + "/usr/local/lib/libnvpl_blas_core.so.0", + ] + + if "129" in desired_cuda: + libs_to_copy += [ + "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9", + "/usr/local/cuda/lib64/libcufile.so.0", + "/usr/local/cuda/lib64/libcufile_rdma.so.1", + ] + + # Copy libraries to unzipped_folder/a/lib + for lib_path in libs_to_copy: + lib_name = os.path.basename(lib_path) + shutil.copy2(lib_path, f"{folder}/tmp/torch/lib/{lib_name}") + os.system( + f"cd {folder}/tmp/torch/lib/; " + f"patchelf --set-rpath '$ORIGIN' --force-rpath {folder}/tmp/torch/lib/{lib_name}" + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Make sure the wheel is tagged with manylinux_2_28 for f in os.scandir(f"{folder}/tmp/"): @@ -213,8 +314,19 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: replace_tag(f"{f.path}/WHEEL") break +<<<<<<< HEAD os.system(f"wheel pack {folder}/tmp/ -d {folder}") os.system(f"rm -rf {folder}/tmp/") +======= + os.mkdir(f"{folder}/cuda_wheel") + os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *") + shutil.move( + f"{folder}/cuda_wheel/{wheelname}", + f"{folder}/{wheelname}", + copy_function=shutil.copy2, + ) + os.system(f"rm -rf {folder}/tmp/ {folder}/cuda_wheel/") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def complete_wheel(folder: str) -> str: @@ -237,7 +349,18 @@ def complete_wheel(folder: str) -> str: f"/{folder}/dist/{repaired_wheel_name}", ) else: +<<<<<<< HEAD repaired_wheel_name = list_dir(f"/{folder}/dist")[0] +======= + repaired_wheel_name = wheel_name.replace( + "linux_aarch64", "manylinux_2_28_aarch64" + ) + print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}") + os.rename( + f"/{folder}/dist/{wheel_name}", + f"/{folder}/dist/{repaired_wheel_name}", + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) print(f"Copying {repaired_wheel_name} to artifacts") shutil.copy2( @@ -274,6 +397,7 @@ def parse_arguments(): ).decode() print("Building PyTorch wheel") +<<<<<<< HEAD build_vars = "" # MAX_JOB=5 is not required for CPU backend (see commit 465d98b) if enable_cuda: @@ -288,6 +412,12 @@ def parse_arguments(): else: print("Configuring build for bundled NVIDIA libraries") # Keep existing static linking approach - already configured above +======= + build_vars = 
"CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " + # MAX_JOB=5 is not required for CPU backend (see commit 465d98b) + if enable_cuda: + build_vars = "MAX_JOBS=5 " + build_vars +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") desired_cuda = os.getenv("DESIRED_CUDA") @@ -313,6 +443,7 @@ def parse_arguments(): build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 " if enable_mkldnn: +<<<<<<< HEAD print("build pytorch with mkldnn+acl backend") build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON " build_vars += "ACL_ROOT_DIR=/acl " @@ -324,6 +455,25 @@ def parse_arguments(): print("build pytorch without mkldnn backend") os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation") +======= + build_ArmComputeLibrary() + print("build pytorch with mkldnn+acl backend") + build_vars += ( + "USE_MKLDNN=ON USE_MKLDNN_ACL=ON " + "ACL_ROOT_DIR=/acl " + "LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH " + "ACL_INCLUDE_DIR=/acl/build " + "ACL_LIBRARY=/acl/build " + ) + if enable_cuda: + build_vars += "BLAS=NVPL " + else: + build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/OpenBLAS " + else: + print("build pytorch without mkldnn backend") + + os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if enable_cuda: print("Updating Cuda Dependency") filename = os.listdir("/pytorch/dist/") diff --git a/.ci/aarch64_linux/build_aarch64_wheel.py b/.ci/aarch64_linux/build_aarch64_wheel.py index a157ec57b574a..e1809a8528aae 100755 --- a/.ci/aarch64_linux/build_aarch64_wheel.py +++ b/.ci/aarch64_linux/build_aarch64_wheel.py @@ -241,7 +241,11 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5): try: with socket.create_connection((addr, port), timeout=timeout): return +<<<<<<< HEAD except (ConnectionRefusedError, TimeoutError): # noqa: PERF203 +======= + except (ConnectionRefusedError, socket.timeout): # noqa: PERF203 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if i == attempt_cnt - 1: raise time.sleep(timeout) @@ -299,6 +303,43 @@ def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None: ) +<<<<<<< HEAD +======= +def build_OpenBLAS(host: RemoteHost, git_clone_flags: str = "") -> None: + print("Building OpenBLAS") + host.run_cmd( + f"git clone https://github.com/xianyi/OpenBLAS -b v0.3.28 {git_clone_flags}" + ) + make_flags = "NUM_THREADS=64 USE_OPENMP=1 NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=ARMV8" + host.run_cmd( + f"pushd OpenBLAS && make {make_flags} -j8 && sudo make {make_flags} install && popd && rm -rf OpenBLAS" + ) + + +def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None: + print("Building Arm Compute Library") + acl_build_flags = " ".join( + [ + "debug=0", + "neon=1", + "opencl=0", + "os=linux", + "openmp=1", + "cppthreads=0", + "arch=armv8a", + "multi_isa=1", + "fixed_format_kernels=1", + "build=native", + ] + ) + host.run_cmd( + f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v25.02 {git_clone_flags}" + ) + + host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}") + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None: host.run_cmd("pip3 install auditwheel") host.run_cmd( @@ -404,11 +445,21 @@ def build_torchvision( ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: +<<<<<<< HEAD build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation") +======= + build_vars += ( + f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}" + ) + if host.using_docker(): + build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" + + host.run_cmd(f"cd vision && {build_vars} python3 setup.py bdist_wheel") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) vision_wheel_name = host.list_dir("vision/dist")[0] embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name)) @@ -459,11 +510,21 @@ def build_torchdata( ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: +<<<<<<< HEAD build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation") +======= + build_vars += ( + f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}" + ) + if host.using_docker(): + build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" + + host.run_cmd(f"cd data && {build_vars} python3 setup.py bdist_wheel") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wheel_name = host.list_dir("data/dist")[0] embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name)) @@ -515,11 +576,21 @@ def build_torchtext( ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: +<<<<<<< HEAD build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation") +======= + build_vars += ( + f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}" + ) + if host.using_docker(): + build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" + + host.run_cmd(f"cd text && {build_vars} python3 setup.py bdist_wheel") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wheel_name = host.list_dir("text/dist")[0] embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name)) @@ -573,14 +644,24 @@ def build_torchaudio( ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: +<<<<<<< HEAD build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" +======= + build_vars += ( + f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}" + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if host.using_docker(): 
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" host.run_cmd( f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \ && ./packaging/ffmpeg/build.sh \ +<<<<<<< HEAD && {build_vars} python3 -m build --wheel --no-isolation" +======= + && {build_vars} python3 setup.py bdist_wheel" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) wheel_name = host.list_dir("audio/dist")[0] @@ -666,6 +747,10 @@ def start_build( configure_system( host, compiler=compiler, use_conda=use_conda, python_version=python_version ) +<<<<<<< HEAD +======= + build_OpenBLAS(host, git_clone_flags) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if host.using_docker(): print("Move libgfortant.a into a standard location") @@ -688,12 +773,19 @@ def start_build( f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}" ) +<<<<<<< HEAD host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh") print("Building PyTorch wheel") build_opts = "" if pytorch_build_number is not None: build_opts += f" -C--build-option=--build-number={pytorch_build_number}" +======= + print("Building PyTorch wheel") + build_opts = "" + if pytorch_build_number is not None: + build_opts += f" --build-number {pytorch_build_number}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Breakpad build fails on aarch64 build_vars = "USE_BREAKPAD=0 " if branch == "nightly": @@ -710,6 +802,7 @@ def start_build( if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" if enable_mkldnn: +<<<<<<< HEAD host.run_cmd("pytorch/.ci/docker/common/install_acl.sh") print("build pytorch with mkldnn+acl backend") build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON" @@ -722,6 +815,17 @@ def start_build( print("Repair the wheel") pytorch_wheel_name = host.list_dir("pytorch/dist")[0] ld_library_path = "/acl/build:$HOME/pytorch/build/lib" +======= + build_ArmComputeLibrary(host, git_clone_flags) + print("build pytorch with mkldnn+acl backend") + build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON" + host.run_cmd( + f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}" + ) + print("Repair the wheel") + pytorch_wheel_name = host.list_dir("pytorch/dist")[0] + ld_library_path = "$HOME/acl/build:$HOME/pytorch/build/lib" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) host.run_cmd( f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}" ) @@ -733,7 +837,11 @@ def start_build( else: print("build pytorch without mkldnn backend") host.run_cmd( +<<<<<<< HEAD f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}" +======= + f"cd pytorch && {build_vars} python3 setup.py bdist_wheel{build_opts}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) print("Deleting build folder") @@ -877,7 +985,11 @@ def terminate_instances(instance_type: str) -> None: def parse_arguments(): from argparse import ArgumentParser +<<<<<<< HEAD parser = ArgumentParser("Build and test AARCH64 wheels using EC2") +======= + parser = ArgumentParser("Builid and test AARCH64 
wheels using EC2") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) parser.add_argument("--key-name", type=str) parser.add_argument("--debug", action="store_true") parser.add_argument("--build-only", action="store_true") @@ -974,7 +1086,11 @@ def parse_arguments(): install_condaforge_python(host, args.python_version) sys.exit(0) +<<<<<<< HEAD python_version = args.python_version if args.python_version is not None else "3.10" +======= + python_version = args.python_version if args.python_version is not None else "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if args.use_torch_from_pypi: configure_system(host, compiler=args.compiler, python_version=python_version) diff --git a/.ci/docker/README.md b/.ci/docker/README.md index 5a97a0a3c2d46..a795edf2c0b9b 100644 --- a/.ci/docker/README.md +++ b/.ci/docker/README.md @@ -36,6 +36,7 @@ See `build.sh` for valid build environments (it's the giant switch). # Set flags (see build.sh) and build image sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest ``` +<<<<<<< HEAD ## [Guidance] Adding a New Base Docker Image @@ -137,3 +138,5 @@ If your new Docker image needs a library installed from a specific pinned commit The `docker-builds.yml` workflow pre-builds the Docker images whenever changes occur in the `.ci/docker/` directory. This includes the pinned commit updates. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/almalinux/Dockerfile b/.ci/docker/almalinux/Dockerfile index ce7803cf9acd2..87fbc51917829 100644 --- a/.ci/docker/almalinux/Dockerfile +++ b/.ci/docker/almalinux/Dockerfile @@ -64,6 +64,7 @@ FROM cuda as cuda12.9 RUN bash ./install_cuda.sh 12.9 ENV DESIRED_CUDA=12.9 +<<<<<<< HEAD FROM cuda as cuda13.0 RUN bash ./install_cuda.sh 13.0 ENV DESIRED_CUDA=13.0 @@ -71,6 +72,10 @@ ENV DESIRED_CUDA=13.0 FROM ${ROCM_IMAGE} as rocm ARG PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} +======= +FROM ${ROCM_IMAGE} as rocm +ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ADD ./common/install_mkl.sh install_mkl.sh RUN bash ./install_mkl.sh && rm install_mkl.sh ENV MKLROOT /opt/intel @@ -81,10 +86,17 @@ ADD ./common/install_mnist.sh install_mnist.sh RUN bash ./install_mnist.sh FROM base as all_cuda +<<<<<<< HEAD COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6 COPY --from=cuda12.8 /usr/local/cuda-12.8 /usr/local/cuda-12.8 COPY --from=cuda12.9 /usr/local/cuda-12.9 /usr/local/cuda-12.9 COPY --from=cuda13.0 /usr/local/cuda-13.0 /usr/local/cuda-13.0 +======= +COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8 +COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6 +COPY --from=cuda12.8 /usr/local/cuda-12.8 /usr/local/cuda-12.8 +COPY --from=cuda12.9 /usr/local/cuda-12.9 /usr/local/cuda-12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Final step FROM ${BASE_TARGET} as final diff --git a/.ci/docker/almalinux/build.sh b/.ci/docker/almalinux/build.sh index ad234ce1ffb93..50aeb082d09b5 100755 --- a/.ci/docker/almalinux/build.sh +++ 
b/.ci/docker/almalinux/build.sh @@ -36,12 +36,15 @@ case ${DOCKER_TAG_PREFIX} in ;; rocm*) BASE_TARGET=rocm +<<<<<<< HEAD PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" # add gfx950, gfx115x conditionally starting in ROCm 7.0 if [[ "$ROCM_VERSION" == *"7.0"* ]]; then PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" fi EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ;; *) echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}" diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 54339e5efbbde..054853b44efcd 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -56,14 +56,20 @@ elif [[ "$image" == *-noble* ]]; then UBUNTU_VERSION=24.04 elif [[ "$image" == *ubuntu* ]]; then extract_version_from_image_name ubuntu UBUNTU_VERSION +<<<<<<< HEAD elif [[ "$image" == *centos* ]]; then extract_version_from_image_name centos CENTOS_VERSION +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [ -n "${UBUNTU_VERSION}" ]; then OS="ubuntu" +<<<<<<< HEAD elif [ -n "${CENTOS_VERSION}" ]; then OS="centos" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else echo "Unable to derive operating system base..." exit 1 @@ -80,6 +86,7 @@ elif [[ "$image" == *cuda*linter* ]]; then elif [[ "$image" == *linter* ]]; then # Use a separate Dockerfile for linter to keep a small image size DOCKERFILE="linter/Dockerfile" +<<<<<<< HEAD elif [[ "$image" == *riscv* ]]; then # Use RISC-V specific Dockerfile DOCKERFILE="ubuntu-cross-riscv/Dockerfile" @@ -90,6 +97,15 @@ _UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96 if [[ "$image" == *rocm* ]]; then _UCX_COMMIT=29831d319e6be55cb8c768ca61de335c934ca39e _UCC_COMMIT=9f4b242cbbd8b1462cbc732eb29316cdfa124b77 +======= +fi + +_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb +_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b +if [[ "$image" == *rocm* ]]; then + _UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6 + _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi tag=$(echo $image | awk -F':' '{print $2}') @@ -98,6 +114,7 @@ tag=$(echo $image | awk -F':' '{print $2}') # configuration, so we hardcode everything here rather than do it # from scratch case "$tag" in +<<<<<<< HEAD pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11) CUDA_VERSION=12.4 ANACONDA_PYTHON_VERSION=3.10 @@ -121,6 +138,11 @@ case "$tag" in ;; pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11) CUDA_VERSION=13.0.0 +======= + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11) + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes @@ -131,6 +153,10 @@ case "$tag" in ;; pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks) CUDA_VERSION=12.8.1 +<<<<<<< HEAD +======= + CUDNN_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=9 VISION=yes @@ -140,18 +166,92 @@ case 
"$tag" in TRITON=yes INDUCTOR_BENCHMARKS=yes ;; +<<<<<<< HEAD pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm) CUDA_VERSION=12.8.1 ANACONDA_PYTHON_VERSION=3.12 GCC_VERSION=11 +======= + pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks) + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.12 + GCC_VERSION=9 + VISION=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks) + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.13 + GCC_VERSION=9 + VISION=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9) + CUDA_VERSION=12.6.3 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + VISION=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + TRITON=yes + ;; +<<<<<<< HEAD + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9) + CUDA_VERSION=12.8.1 +======= + pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks) + CUDA_VERSION=12.6 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=9 VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks) + CUDA_VERSION=12.6 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.12 + GCC_VERSION=9 + VISION=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks) + CUDA_VERSION=12.6 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.13 + GCC_VERSION=9 + VISION=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + TRITON=yes + INDUCTOR_BENCHMARKS=yes ;; pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9) CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=9 VISION=yes @@ -161,17 +261,27 @@ case "$tag" in TRITON=yes ;; pytorch-linux-jammy-py3-clang12-onnx) +<<<<<<< HEAD ANACONDA_PYTHON_VERSION=3.10 +======= + ANACONDA_PYTHON_VERSION=3.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CLANG_VERSION=12 VISION=yes ONNX=yes ;; +<<<<<<< HEAD pytorch-linux-jammy-py3.10-clang12) ANACONDA_PYTHON_VERSION=3.10 +======= + pytorch-linux-jammy-py3.9-clang12) + ANACONDA_PYTHON_VERSION=3.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CLANG_VERSION=12 VISION=yes TRITON=yes ;; +<<<<<<< HEAD pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3) if [[ $tag =~ "jammy" ]]; then ANACONDA_PYTHON_VERSION=3.10 @@ -181,11 +291,31 @@ case "$tag" in GCC_VERSION=11 VISION=yes ROCM_VERSION=7.0 +======= + pytorch-linux-jammy-py3.11-clang12) + ANACONDA_PYTHON_VERSION=3.11 + CLANG_VERSION=12 + VISION=yes + TRITON=yes + ;; + pytorch-linux-jammy-py3.9-gcc9) + ANACONDA_PYTHON_VERSION=3.9 + GCC_VERSION=9 + VISION=yes + TRITON=yes + ;; + pytorch-linux-jammy-rocm-n-1-py3) + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=11 + 
VISION=yes + ROCM_VERSION=6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) NINJA_VERSION=1.9.0 TRITON=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100" if [[ $tag =~ "benchmarks" ]]; then INDUCTOR_BENCHMARKS=yes @@ -195,10 +325,39 @@ case "$tag" in ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes +======= + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-rocm-n-py3) + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=11 + VISION=yes + ROCM_VERSION=6.4 + NINJA_VERSION=1.9.0 + TRITON=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-xpu-2025.0-py3) + ANACONDA_PYTHON_VERSION=3.9 + GCC_VERSION=11 + VISION=yes + XPU_VERSION=2025.0 + NINJA_VERSION=1.9.0 + TRITON=yes + ;; + pytorch-linux-jammy-xpu-2025.1-py3) + ANACONDA_PYTHON_VERSION=3.9 + GCC_VERSION=11 + VISION=yes +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) XPU_VERSION=2025.1 NINJA_VERSION=1.9.0 TRITON=yes ;; +<<<<<<< HEAD pytorch-linux-jammy-xpu-n-py3) ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 @@ -209,6 +368,10 @@ case "$tag" in ;; pytorch-linux-jammy-py3-gcc11-inductor-benchmarks) ANACONDA_PYTHON_VERSION=3.10 +======= + pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks) + ANACONDA_PYTHON_VERSION=3.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GCC_VERSION=11 VISION=yes KATEX=yes @@ -216,20 +379,46 @@ case "$tag" in DOCS=yes INDUCTOR_BENCHMARKS=yes ;; +<<<<<<< HEAD pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12) ANACONDA_PYTHON_VERSION=3.10 CUDA_VERSION=12.8.1 +======= + pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12) + ANACONDA_PYTHON_VERSION=3.9 + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CLANG_VERSION=12 VISION=yes TRITON=yes ;; +<<<<<<< HEAD +======= + pytorch-linux-jammy-py3-clang12-asan) + ANACONDA_PYTHON_VERSION=3.9 + CLANG_VERSION=12 + VISION=yes + TRITON=yes + ;; + pytorch-linux-jammy-py3-clang15-asan) + ANACONDA_PYTHON_VERSION=3.10 + CLANG_VERSION=15 + VISION=yes + ;; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pytorch-linux-jammy-py3-clang18-asan) ANACONDA_PYTHON_VERSION=3.10 CLANG_VERSION=18 VISION=yes ;; +<<<<<<< HEAD pytorch-linux-jammy-py3.10-gcc11) ANACONDA_PYTHON_VERSION=3.10 +======= + pytorch-linux-jammy-py3.9-gcc11) + ANACONDA_PYTHON_VERSION=3.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GCC_VERSION=11 VISION=yes KATEX=yes @@ -256,10 +445,20 @@ case "$tag" in TRITON_CPU=yes ;; pytorch-linux-jammy-linter) +<<<<<<< HEAD PYTHON_VERSION=3.10 ;; pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter) PYTHON_VERSION=3.10 +======= + # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627. + # We will need to update mypy version eventually, but that's for another day. 
The task + # would be to upgrade mypy to 1.0.0 with Python 3.11 + PYTHON_VERSION=3.9 + ;; + pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter) + PYTHON_VERSION=3.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDA_VERSION=12.8.1 ;; pytorch-linux-jammy-aarch64-py3.10-gcc11) @@ -267,6 +466,10 @@ case "$tag" in GCC_VERSION=11 ACL=yes VISION=yes +<<<<<<< HEAD +======= + CONDA_CMAKE=yes +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OPENBLAS=yes # snadampal: skipping llvm src build install because the current version # from pytorch/llvm:9.0.1 is x86 specific @@ -277,18 +480,27 @@ case "$tag" in GCC_VERSION=11 ACL=yes VISION=yes +<<<<<<< HEAD +======= + CONDA_CMAKE=yes +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OPENBLAS=yes # snadampal: skipping llvm src build install because the current version # from pytorch/llvm:9.0.1 is x86 specific SKIP_LLVM_SRC_BUILD_INSTALL=yes INDUCTOR_BENCHMARKS=yes ;; +<<<<<<< HEAD pytorch-linux-noble-riscv64-py3.12-gcc14) GCC_VERSION=14 ;; *) # Catch-all for builds that are not hardcoded. PROTOBUF=yes +======= + *) + # Catch-all for builds that are not hardcoded. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes echo "image '$image' did not match an existing build configuration" if [[ "$image" == *py* ]]; then @@ -296,6 +508,10 @@ case "$tag" in fi if [[ "$image" == *cuda* ]]; then extract_version_from_image_name cuda CUDA_VERSION +<<<<<<< HEAD +======= + extract_version_from_image_name cudnn CUDNN_VERSION +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [[ "$image" == *rocm* ]]; then extract_version_from_image_name rocm ROCM_VERSION @@ -303,7 +519,10 @@ case "$tag" in TRITON=yes # To ensure that any ROCm config will build using conda cmake # and thus have LAPACK/MKL enabled +<<<<<<< HEAD CONDA_CMAKE=yes +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [[ "$image" == *centos7* ]]; then NINJA_VERSION=1.10.2 @@ -320,9 +539,12 @@ case "$tag" in if [[ "$image" == *glibc* ]]; then extract_version_from_image_name glibc GLIBC_VERSION fi +<<<<<<< HEAD if [[ "$image" == *cmake* ]]; then extract_version_from_image_name cmake CMAKE_VERSION fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ;; esac @@ -336,20 +558,29 @@ if [[ -n "${CI:-}" ]]; then progress_flag="--progress=plain" fi +<<<<<<< HEAD if [[ "${DOCKER_BUILDKIT}" == 0 ]]; then progress_flag="" fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Build image docker build \ ${no_cache_flag} \ ${progress_flag} \ --build-arg "BUILD_ENVIRONMENT=${image}" \ +<<<<<<< HEAD --build-arg "PROTOBUF=${PROTOBUF:-}" \ --build-arg "LLVMDEV=${LLVMDEV:-}" \ --build-arg "VISION=${VISION:-}" \ --build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \ --build-arg "CENTOS_VERSION=${CENTOS_VERSION}" \ +======= + --build-arg "LLVMDEV=${LLVMDEV:-}" \ + --build-arg "VISION=${VISION:-}" \ + --build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \ +>>>>>>> 5729657180 ([ROCm] Specialized 
binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --build-arg "DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" \ --build-arg "GLIBC_VERSION=${GLIBC_VERSION}" \ --build-arg "CLANG_VERSION=${CLANG_VERSION}" \ @@ -357,6 +588,7 @@ docker build \ --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \ --build-arg "GCC_VERSION=${GCC_VERSION}" \ --build-arg "CUDA_VERSION=${CUDA_VERSION}" \ +<<<<<<< HEAD --build-arg "CMAKE_VERSION=${CMAKE_VERSION:-}" \ --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \ --build-arg "KATEX=${KATEX:-}" \ @@ -366,6 +598,18 @@ docker build \ --build-arg "UCX_COMMIT=${UCX_COMMIT}" \ --build-arg "UCC_COMMIT=${UCC_COMMIT}" \ --build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \ +======= + --build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \ + --build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \ + --build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \ + --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \ + --build-arg "KATEX=${KATEX:-}" \ + --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \ + --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \ + --build-arg "IMAGE_NAME=${IMAGE_NAME}" \ + --build-arg "UCX_COMMIT=${UCX_COMMIT}" \ + --build-arg "UCC_COMMIT=${UCC_COMMIT}" \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --build-arg "TRITON=${TRITON}" \ --build-arg "TRITON_CPU=${TRITON_CPU}" \ --build-arg "ONNX=${ONNX}" \ @@ -379,7 +623,10 @@ docker build \ --build-arg "OPENBLAS=${OPENBLAS:-}" \ --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \ --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \ +<<<<<<< HEAD --build-arg "INSTALL_MINGW=${INSTALL_MINGW:-}" \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) -f $(dirname ${DOCKERFILE})/Dockerfile \ -t "$tmp_tag" \ "$@" \ @@ -420,6 +667,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then fi if [ -n "$GCC_VERSION" ]; then +<<<<<<< HEAD if [[ "$image" == *riscv* ]]; then # Check RISC-V cross-compilation toolchain version if !(drun riscv64-linux-gnu-gcc-${GCC_VERSION} --version 2>&1 | grep -q " $GCC_VERSION\\W"); then @@ -428,6 +676,9 @@ if [ -n "$GCC_VERSION" ]; then exit 1 fi elif !(drun gcc --version 2>&1 | grep -q " $GCC_VERSION\\W"); then +======= + if !(drun gcc --version 2>&1 | grep -q " $GCC_VERSION\\W"); then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "GCC_VERSION=$GCC_VERSION, but:" drun gcc --version exit 1 @@ -454,9 +705,26 @@ HAS_TRITON=$(drun python -c "import triton" > /dev/null 2>&1 && echo "yes" || ec if [[ -n "$TRITON" || -n "$TRITON_CPU" ]]; then if [ "$HAS_TRITON" = "no" ]; then echo "expecting triton to be installed, but it is not" +<<<<<<< HEAD exit 0 fi elif [ "$HAS_TRITON" = "yes" ]; then echo "expecting triton to not be installed, but it is" exit 0 +======= + exit 1 + fi +elif [ "$HAS_TRITON" = "yes" ]; then + echo "expecting triton to not be installed, but it is" + exit 1 +fi + +# Sanity check cmake version. Executorch reinstalls cmake and I'm not sure if +# they support 4.0.0 yet, so exclude them from this check. 
+CMAKE_VERSION=$(drun cmake --version) +if [[ "$EXECUTORCH" != *yes* && "$CMAKE_VERSION" != *4.* ]]; then + echo "CMake version is not 4.0.0:" + drun cmake --version + exit 1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi diff --git a/.ci/docker/centos-rocm/Dockerfile b/.ci/docker/centos-rocm/Dockerfile index 3d35bb79c7b8d..05d6320cabc05 100644 --- a/.ci/docker/centos-rocm/Dockerfile +++ b/.ci/docker/centos-rocm/Dockerfile @@ -1,7 +1,13 @@ ARG CENTOS_VERSION +<<<<<<< HEAD FROM quay.io/centos/centos:stream${CENTOS_VERSION} +======= +FROM centos:${CENTOS_VERSION} + +ARG CENTOS_VERSION +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Set AMD gpu targets to build for ARG PYTORCH_ROCM_ARCH @@ -13,6 +19,7 @@ ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} COPY ./common/install_base.sh install_base.sh RUN bash ./install_base.sh && rm install_base.sh +<<<<<<< HEAD #Install langpack RUN yum install -y glibc-langpack-en @@ -28,6 +35,21 @@ ENV BASH_ENV "/etc/profile" # Install ninja RUN dnf --enablerepo=crb install -y ninja-build +======= +# Update CentOS git version +RUN yum -y remove git +RUN yum -y remove git-* +RUN yum -y install https://packages.endpointdev.com/rhel/7/os/x86_64/endpoint-repo-1.9-1.x86_64.rpm && \ + sed -i 's/packages.endpoint/packages.endpointdev/' /etc/yum.repos.d/endpoint.repo +RUN yum install -y git + +# Install devtoolset +ARG DEVTOOLSET_VERSION +COPY ./common/install_devtoolset.sh install_devtoolset.sh +RUN bash ./install_devtoolset.sh && rm install_devtoolset.sh +ENV BASH_ENV "/etc/profile" + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install non-default glibc version ARG GLIBC_VERSION COPY ./common/install_glibc.sh install_glibc.sh @@ -48,7 +70,10 @@ COPY ./common/install_conda.sh install_conda.sh COPY ./common/common_utils.sh common_utils.sh RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install vision packages like OpenCV ARG VISION COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./ @@ -58,6 +83,7 @@ ENV INSTALLED_VISION ${VISION} # Install rocm ARG ROCM_VERSION +<<<<<<< HEAD RUN mkdir ci_commit_pins COPY ./common/common_utils.sh common_utils.sh COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt @@ -65,14 +91,22 @@ COPY ./common/install_rocm.sh install_rocm.sh RUN bash ./install_rocm.sh RUN rm install_rocm.sh common_utils.sh RUN rm -r ci_commit_pins +======= +COPY ./common/install_rocm.sh install_rocm.sh +RUN bash ./install_rocm.sh +RUN rm install_rocm.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) COPY ./common/install_rocm_magma.sh install_rocm_magma.sh RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} RUN rm install_rocm_magma.sh COPY ./common/install_amdsmi.sh install_amdsmi.sh RUN bash ./install_amdsmi.sh RUN rm install_amdsmi.sh +<<<<<<< HEAD ENV ROCM_PATH /opt/rocm +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV PATH /opt/rocm/bin:$PATH ENV PATH 
/opt/rocm/hcc/bin:$PATH ENV PATH /opt/rocm/hip/bin:$PATH @@ -82,7 +116,10 @@ ENV MAGMA_HOME /opt/rocm/magma ENV LANG en_US.utf8 ENV LC_ALL en_US.utf8 +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install non-default Ninja version ARG NINJA_VERSION COPY ./common/install_ninja.sh install_ninja.sh @@ -98,7 +135,11 @@ COPY ./common/install_triton.sh install_triton.sh COPY ./common/common_utils.sh common_utils.sh COPY ci_commit_pins/triton.txt triton.txt COPY triton_version.txt triton_version.txt +<<<<<<< HEAD #RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi +======= +RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt # Install ccache/sccache (do this last, so we get priority in PATH) diff --git a/.ci/docker/ci_commit_pins/executorch.txt b/.ci/docker/ci_commit_pins/executorch.txt index f2e2d655a6cf2..f0f16576caf54 100644 --- a/.ci/docker/ci_commit_pins/executorch.txt +++ b/.ci/docker/ci_commit_pins/executorch.txt @@ -1 +1,5 @@ +<<<<<<< HEAD deb42f2a8e48f5032b4a98ee781a15fa87a157cf +======= +56392aa978594cc155fa8af48cd949f5b5f1823a +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/ci_commit_pins/huggingface.txt b/.ci/docker/ci_commit_pins/huggingface.txt new file mode 100644 index 0000000000000..f00d6ca4f9ca7 --- /dev/null +++ b/.ci/docker/ci_commit_pins/huggingface.txt @@ -0,0 +1 @@ +243e186efbf7fb93328dd6b34927a4e8c8f24395 diff --git a/.ci/docker/ci_commit_pins/nccl-cu12.txt b/.ci/docker/ci_commit_pins/nccl-cu12.txt index 77a73992346c1..57a4f51b2dd1e 100644 --- a/.ci/docker/ci_commit_pins/nccl-cu12.txt +++ b/.ci/docker/ci_commit_pins/nccl-cu12.txt @@ -1 +1,5 @@ -v2.27.5-1 \ No newline at end of file +<<<<<<< HEAD +v2.27.5-1 +======= +v2.27.3-1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/ci_commit_pins/triton-xpu.txt b/.ci/docker/ci_commit_pins/triton-xpu.txt index b03606f6defc1..6abd4a388f1c2 100644 --- a/.ci/docker/ci_commit_pins/triton-xpu.txt +++ b/.ci/docker/ci_commit_pins/triton-xpu.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 1b0418a9a454b2b93ab8d71f40e59d2297157fae +======= +ae324eeac8e102a2b40370e341460f3791353398 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index d893bdd32ab34..03fc672f6eaaf 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1,5 @@ +<<<<<<< HEAD ac80c4190aa0321f761a08af97e1e1eee41f01d9 +======= +21876a4bbaf371bcb83df8e6ee4f43a92f524dfe +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/common/cache_vision_models.sh b/.ci/docker/common/cache_vision_models.sh index 8380c48177de3..760cbb85cd2a9 100644 --- a/.ci/docker/common/cache_vision_models.sh +++ b/.ci/docker/common/cache_vision_models.sh @@ -2,6 +2,7 @@ set -ex +<<<<<<< HEAD # Skip pytorch-nightly installation in docker images # Installation of pytorch-nightly is needed to 
prefetch mobilenet_v2 avd v3 models for some tests. # Came from https://github.com/ROCm/pytorch/commit/85bd6bc0105162293fa0bbfb7b661f85ec67f85a @@ -16,6 +17,8 @@ set -ex echo "Skip torch-nightly installation" exit 0 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" # Cache the test models at ~/.cache/torch/hub/ diff --git a/.ci/docker/common/install_acl.sh b/.ci/docker/common/install_acl.sh index 0b865e5bc6f8d..eae1f89b5a90e 100755 --- a/.ci/docker/common/install_acl.sh +++ b/.ci/docker/common/install_acl.sh @@ -1,3 +1,4 @@ +<<<<<<< HEAD #!/bin/bash # Script used only in CD pipeline @@ -24,4 +25,22 @@ do sudo cp -r ${ACL_CHECKOUT_DIR}/${d} ${ACL_INSTALL_DIR}/${d} done -rm -rf $ACL_CHECKOUT_DIR \ No newline at end of file +rm -rf $ACL_CHECKOUT_DIR +======= +set -euo pipefail + +readonly version=v25.02 +readonly src_host=https://github.com/ARM-software +readonly src_repo=ComputeLibrary + +# Clone ACL +[[ ! -d ${src_repo} ]] && git clone ${src_host}/${src_repo}.git +cd ${src_repo} + +git checkout $version + +# Build with scons +scons -j8 Werror=0 debug=0 neon=1 opencl=0 embed_kernels=0 \ + os=linux arch=armv8a build=native multi_isa=1 \ + fixed_format_kernels=1 openmp=1 cppthreads=0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/common/install_base.sh b/.ci/docker/common/install_base.sh index a1c98aa25a31a..adeb1b812b984 100755 --- a/.ci/docker/common/install_base.sh +++ b/.ci/docker/common/install_base.sh @@ -77,15 +77,19 @@ install_ubuntu() { # see: https://github.com/pytorch/pytorch/issues/65931 apt-get install -y libgnutls30 +<<<<<<< HEAD if [[ "$UBUNTU_VERSION" == "22.04"* ]]; then apt-get install -y libopenblas-dev fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cleanup package manager apt-get autoclean && apt-get clean rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* } +<<<<<<< HEAD build_libpng() { # install few packages yum install -y zlib zlib-devel @@ -125,6 +129,16 @@ install_centos() { ccache_deps="asciidoc docbook-dtds docbook-style-xsl libxslt" numpy_deps="gcc-gfortran" yum install -y $ALLOW_ERASE \ +======= +install_centos() { + # Need EPEL for many packages we depend on. + # See http://fedoraproject.org/wiki/EPEL + yum --enablerepo=extras install -y epel-release + + ccache_deps="asciidoc docbook-dtds docbook-style-xsl libxslt" + numpy_deps="gcc-gfortran" + yum install -y \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) $ccache_deps \ $numpy_deps \ autoconf \ @@ -141,13 +155,20 @@ install_centos() { glibc-headers \ glog-devel \ libstdc++-devel \ +<<<<<<< HEAD + make \ +======= + libsndfile-devel \ make \ + opencv-devel \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sudo \ wget \ vim \ unzip \ gdb +<<<<<<< HEAD if [[ $OS_VERSION == 9 ]] then dnf --enablerepo=crb -y install libsndfile-devel @@ -163,6 +184,8 @@ install_centos() { # Libpng is required for torchvision build. 
build_libpng +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cleanup yum clean all rm -rf /var/cache/yum @@ -170,10 +193,15 @@ install_centos() { rm -rf /var/lib/yum/history } +<<<<<<< HEAD ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') # Install base packages depending on the base OS +======= +# Install base packages depending on the base OS +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case "$ID" in ubuntu) install_ubuntu diff --git a/.ci/docker/common/install_cache.sh b/.ci/docker/common/install_cache.sh index 80839990e4e6f..4b220cba6ed5a 100644 --- a/.ci/docker/common/install_cache.sh +++ b/.ci/docker/common/install_cache.sh @@ -36,12 +36,16 @@ sed -e 's|PATH="\(.*\)"|PATH="/opt/cache/bin:\1"|g' -i /etc/environment export PATH="/opt/cache/bin:$PATH" # Setup compiler cache +<<<<<<< HEAD if [ -n "$ROCM_VERSION" ]; then curl --retry 3 http://repo.radeon.com/misc/.sccache_amd/sccache -o /opt/cache/bin/sccache else install_ubuntu fi +======= +install_ubuntu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) chmod a+x /opt/cache/bin/sccache function write_sccache_stub() { diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh index 68f17b73f10a1..d7da44ec5ec50 100755 --- a/.ci/docker/common/install_conda.sh +++ b/.ci/docker/common/install_conda.sh @@ -24,10 +24,14 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then source "${SCRIPT_FOLDER}/common_utils.sh" pushd /tmp +<<<<<<< HEAD if [ -n $CENTOS_VERSION ] && [[ $CENTOS_VERSION == 7.* ]]; then NO_CHECK_CERTIFICATE_FLAG="--no-check-certificate" fi wget -q "${BASE_URL}/${CONDA_FILE}" ${NO_CHECK_CERTIFICATE_FLAG} +======= + wget -q "${BASE_URL}/${CONDA_FILE}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NB: Manually invoke bash per https://github.com/conda/conda/issues/10431 as_jenkins bash "${CONDA_FILE}" -b -f -p "/opt/conda" popd @@ -43,6 +47,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then # Prevent conda from updating to 4.14.0, which causes docker build failures # See https://hud.pytorch.org/pytorch/pytorch/commit/754d7f05b6841e555cea5a4b2c505dd9e0baec1d +<<<<<<< HEAD # Uncomment the below when resolved to track the latest conda update, # but this is required for CentOS stream 9 builds ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') @@ -50,6 +55,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then if [[ $ID == centos && $OS_VERSION == 9 ]]; then as_jenkins conda update -y -n base conda fi +======= + # Uncomment the below when resolved to track the latest conda update + # as_jenkins conda update -y -n base conda +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ $(uname -m) == "aarch64" ]]; then export SYSROOT_DEP="sysroot_linux-aarch64=2.17" @@ -73,10 +82,17 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then fi # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README +<<<<<<< HEAD if [[ $(uname -m) != "aarch64" ]]; then pip_install mkl==2024.2.0 pip_install mkl-static==2024.2.0 pip_install mkl-include==2024.2.0 +======= + if [[ $(uname -m) == 
"aarch64" ]]; then + conda_install "openblas==0.3.29=*openmp*" + else + conda_install "mkl=2021.4.0 mkl-include=2021.4.0" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source @@ -94,6 +110,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then conda_install_through_forge libstdcxx-ng=14 fi +<<<<<<< HEAD # Install required libstdc++.so.6 version if [ "$ANACONDA_PYTHON_VERSION" = "3.10" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.9" ] ; then conda_install_through_forge libstdcxx-ng=12 @@ -103,6 +120,8 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then conda_install_through_forge libstdcxx-ng=14 fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install some other packages, including those needed for Python test reporting pip_install -r /opt/conda/requirements-ci.txt diff --git a/.ci/docker/common/install_cpython.sh b/.ci/docker/common/install_cpython.sh index c873c930097b1..9b0a894a33558 100755 --- a/.ci/docker/common/install_cpython.sh +++ b/.ci/docker/common/install_cpython.sh @@ -3,10 +3,18 @@ set -uex -o pipefail PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python +<<<<<<< HEAD GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py # Python versions to be installed in /opt/$VERSION_NO CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t 3.14.0 3.14.0t"} +======= +PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/heads # @lint-ignore +GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py + +# Python versions to be installed in /opt/$VERSION_NO +CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) function check_var { if [ -z "$1" ]; then @@ -23,8 +31,14 @@ function do_cpython_build { tar -xzf Python-$py_ver.tgz local additional_flags="" +<<<<<<< HEAD if [[ "$py_ver" == *"t" ]]; then additional_flags=" --disable-gil" +======= + if [ "$py_ver" == "3.13.0t" ]; then + additional_flags=" --disable-gil" + mv cpython-3.13/ cpython-3.13t/ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi pushd $py_folder @@ -66,15 +80,21 @@ function do_cpython_build { ln -s pip3 ${prefix}/bin/pip fi # install setuptools since python 3.12 is required to use distutils +<<<<<<< HEAD # packaging is needed to create symlink since wheel no longer provides needed information ${prefix}/bin/pip install packaging==25.0 wheel==0.45.1 setuptools==80.9.0 local abi_tag=$(${prefix}/bin/python -c "from packaging.tags import interpreter_name, interpreter_version; import sysconfig ; from sysconfig import get_config_var; print('{0}{1}-{0}{1}{2}'.format(interpreter_name(), interpreter_version(), 't' if sysconfig.get_config_var('Py_GIL_DISABLED') else ''))") +======= + ${prefix}/bin/pip install wheel==0.34.2 setuptools==68.2.2 + local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ln -sf ${prefix} /opt/python/${abi_tag} } function build_cpython { local py_ver=$1 check_var $py_ver 
+<<<<<<< HEAD local py_suffix=$py_ver local py_folder=$py_ver @@ -85,6 +105,26 @@ function build_cpython { fi wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz do_cpython_build $py_ver Python-$py_suffix +======= + check_var $PYTHON_DOWNLOAD_URL + local py_ver_folder=$py_ver + + if [ "$py_ver" = "3.13.0t" ]; then + PY_VER_SHORT="3.13" + PYT_VER_SHORT="3.13t" + check_var $PYTHON_DOWNLOAD_GITHUB_BRANCH + wget $PYTHON_DOWNLOAD_GITHUB_BRANCH/$PY_VER_SHORT.tar.gz -O Python-$py_ver.tgz + do_cpython_build $py_ver cpython-$PYT_VER_SHORT + elif [ "$py_ver" = "3.13.0" ]; then + PY_VER_SHORT="3.13" + check_var $PYTHON_DOWNLOAD_GITHUB_BRANCH + wget $PYTHON_DOWNLOAD_GITHUB_BRANCH/$PY_VER_SHORT.tar.gz -O Python-$py_ver.tgz + do_cpython_build $py_ver cpython-$PY_VER_SHORT + else + wget -q $PYTHON_DOWNLOAD_URL/$py_ver_folder/Python-$py_ver.tgz + do_cpython_build $py_ver Python-$py_ver + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) rm -f Python-$py_ver.tgz } diff --git a/.ci/docker/common/install_cuda.sh b/.ci/docker/common/install_cuda.sh index fe2f9ae3185a3..2aa2fd95c165c 100644 --- a/.ci/docker/common/install_cuda.sh +++ b/.ci/docker/common/install_cuda.sh @@ -10,8 +10,11 @@ else arch_path='sbsa' fi +<<<<<<< HEAD NVSHMEM_VERSION=3.4.5 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) function install_cuda { version=$1 runfile=$2 @@ -42,6 +45,7 @@ function install_cudnn { rm -rf tmp_cudnn } +<<<<<<< HEAD function install_nvshmem { cuda_major_version=$1 # e.g. "12" nvshmem_version=$2 # e.g. "3.3.9" @@ -97,12 +101,20 @@ function install_124 { function install_126 { CUDNN_VERSION=9.10.2.21 echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" +======= +function install_126 { + CUDNN_VERSION=9.10.2.21 + echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) install_cuda 12.6.3 cuda_12.6.3_560.35.05_linux install_cudnn 12 $CUDNN_VERSION +<<<<<<< HEAD install_nvshmem 12 $NVSHMEM_VERSION +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDA_VERSION=12.6 bash install_nccl.sh CUDA_VERSION=12.6 bash install_cusparselt.sh @@ -112,15 +124,22 @@ function install_126 { function install_129 { CUDNN_VERSION=9.10.2.21 +<<<<<<< HEAD echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" +======= + echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # install CUDA 12.9.1 in the same container install_cuda 12.9.1 cuda_12.9.1_575.57.08_linux # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement install_cudnn 12 $CUDNN_VERSION +<<<<<<< HEAD install_nvshmem 12 $NVSHMEM_VERSION +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDA_VERSION=12.9 bash install_nccl.sh CUDA_VERSION=12.9 bash install_cusparselt.sh @@ -128,17 +147,60 @@ function install_129 { ldconfig } +<<<<<<< HEAD function install_128 { CUDNN_VERSION=9.8.0.87 echo "Installing 
CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" +======= +function prune_126 { + echo "Pruning CUDA 12.6" + ##################################################################################### + # CUDA 12.6 prune static libs + ##################################################################################### + export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune" + export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64" + + export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" + export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" + + if [[ -n "$OVERRIDE_GENCODE" ]]; then + export GENCODE=$OVERRIDE_GENCODE + fi + if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then + export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN + fi + + # all CUDA libs except CuDNN and CuBLAS + ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \ + | xargs -I {} bash -c \ + "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}" + + # prune CuDNN and CuBLAS + $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a + $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a + + ##################################################################################### + # CUDA 12.6 prune visual tools + ##################################################################################### + export CUDA_BASE="/usr/local/cuda-12.6/" + rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/ +} + +function install_128 { + CUDNN_VERSION=9.8.0.87 + echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # install CUDA 12.8.1 in the same container install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement install_cudnn 12 $CUDNN_VERSION +<<<<<<< HEAD install_nvshmem 12 $NVSHMEM_VERSION +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDA_VERSION=12.8 bash install_nccl.sh CUDA_VERSION=12.8 bash install_cusparselt.sh @@ -146,6 +208,7 @@ function install_128 { ldconfig } +<<<<<<< HEAD function install_130 { CUDNN_VERSION=9.13.0.50 echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" @@ -164,20 +227,29 @@ function install_130 { ldconfig } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # idiomatic parameter and option handling in sh while test $# -gt 0 do case "$1" in +<<<<<<< HEAD 12.4) install_124; ;; 12.6|12.6.*) install_126; +======= + 12.6|12.6.*) install_126; prune_126 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ;; 
12.8|12.8.*) install_128; ;; 12.9|12.9.*) install_129; ;; +<<<<<<< HEAD 13.0|13.0.*) install_130; ;; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) *) echo "bad argument $1"; exit 1 ;; esac diff --git a/.ci/docker/common/install_cudnn.sh b/.ci/docker/common/install_cudnn.sh new file mode 100644 index 0000000000000..7ee5e73226cb6 --- /dev/null +++ b/.ci/docker/common/install_cudnn.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +if [[ -n "${CUDNN_VERSION}" ]]; then + # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement + mkdir tmp_cudnn + pushd tmp_cudnn + if [[ ${CUDA_VERSION:0:4} == "12.9" || ${CUDA_VERSION:0:4} == "12.8" ]]; then + CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive" + elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then + CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive" + elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then + CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive" + else + echo "Unsupported CUDA version ${CUDA_VERSION}" + exit 1 + fi + curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz + tar xf ${CUDNN_NAME}.tar.xz + cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/ + cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/ + popd + rm -rf tmp_cudnn + ldconfig +fi diff --git a/.ci/docker/common/install_cusparselt.sh b/.ci/docker/common/install_cusparselt.sh index b532c086371f1..3443da6482a1e 100644 --- a/.ci/docker/common/install_cusparselt.sh +++ b/.ci/docker/common/install_cusparselt.sh @@ -5,6 +5,7 @@ set -ex # cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html mkdir tmp_cusparselt && cd tmp_cusparselt +<<<<<<< HEAD if [[ ${CUDA_VERSION:0:4} =~ "13" ]]; then arch_path='sbsa' export TARGETARCH=${TARGETARCH:-$(uname -m)} @@ -14,6 +15,9 @@ if [[ ${CUDA_VERSION:0:4} =~ "13" ]]; then CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.8.0.4_cuda13-archive" curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz elif [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then +======= +if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) arch_path='sbsa' export TARGETARCH=${TARGETARCH:-$(uname -m)} if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then @@ -21,6 +25,7 @@ elif [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then fi CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.7.1.0-archive" curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz +<<<<<<< HEAD elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then arch_path='sbsa' export TARGETARCH=${TARGETARCH:-$(uname -m)} @@ -29,6 +34,8 @@ elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then fi CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.6.2.3-archive" curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else echo "Not sure which libcusparselt version to install for this ${CUDA_VERSION}" fi diff --git a/.ci/docker/common/install_executorch.sh b/.ci/docker/common/install_executorch.sh index
fb168acd4febe..ede624e175562 100755 --- a/.ci/docker/common/install_executorch.sh +++ b/.ci/docker/common/install_executorch.sh @@ -42,13 +42,17 @@ install_pip_dependencies() { # A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current # numba and scipy version used in PyTorch CI conda_run pip uninstall -y numba scipy +<<<<<<< HEAD # Yaspin is needed for running CI test (get_benchmark_analysis_data.py) pip_install yaspin==3.1.0 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd } setup_executorch() { +<<<<<<< HEAD export PYTHON_EXECUTABLE=python export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON" @@ -66,3 +70,19 @@ if [ $# -eq 0 ]; then else "$@" fi +======= + pushd executorch + + export PYTHON_EXECUTABLE=python + export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" + + as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true + popd +} + +clone_executorch +install_buck2 +install_conda_dependencies +install_pip_dependencies +setup_executorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/common/install_inductor_benchmark_deps.sh b/.ci/docker/common/install_inductor_benchmark_deps.sh index 81467d87f5140..c8ac925d402ad 100644 --- a/.ci/docker/common/install_inductor_benchmark_deps.sh +++ b/.ci/docker/common/install_inductor_benchmark_deps.sh @@ -5,7 +5,13 @@ set -ex source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" function install_huggingface() { +<<<<<<< HEAD pip_install -r huggingface-requirements.txt +======= + local version + commit=$(get_pinned_commit huggingface) + pip_install "git+https://github.com/huggingface/transformers@${commit}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } function install_timm() { @@ -13,6 +19,7 @@ function install_timm() { commit=$(get_pinned_commit timm) pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}" +<<<<<<< HEAD } function install_torchbench() { @@ -30,10 +37,15 @@ function install_torchbench() { chown -R jenkins torchbench chown -R jenkins /opt/conda +======= + # Clean up + conda_run pip uninstall -y torch torchvision triton +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } # Pango is needed for weasyprint which is needed for doctr conda_install pango +<<<<<<< HEAD # Stable packages are ok here, just to satisfy TorchBench check pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 @@ -44,3 +56,7 @@ install_timm # Clean up conda_run pip uninstall -y torch torchvision torchaudio triton torchao +======= +install_huggingface +install_timm +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/common/install_nccl.sh b/.ci/docker/common/install_nccl.sh index 58a8e0b4e49c1..ea0cdfc2bf703 100644 --- a/.ci/docker/common/install_nccl.sh +++ b/.ci/docker/common/install_nccl.sh @@ -7,8 +7,11 @@ if [[ ${CUDA_VERSION:0:2} == "11" ]]; then NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt) elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then NCCL_VERSION=$(cat 
ci_commit_pins/nccl-cu12.txt) +<<<<<<< HEAD elif [[ ${CUDA_VERSION:0:2} == "13" ]]; then NCCL_VERSION=$(cat ci_commit_pins/nccl-cu13.txt) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else echo "Unexpected CUDA_VERSION ${CUDA_VERSION}" exit 1 diff --git a/.ci/docker/common/install_onnx.sh b/.ci/docker/common/install_onnx.sh index 77fd0ff126a43..edccab581ecd6 100755 --- a/.ci/docker/common/install_onnx.sh +++ b/.ci/docker/common/install_onnx.sh @@ -19,8 +19,13 @@ pip_install \ transformers==4.36.2 pip_install coloredlogs packaging +<<<<<<< HEAD pip_install onnxruntime==1.23.1 pip_install onnxscript==0.5.4 +======= +pip_install onnxruntime==1.18.1 +pip_install onnxscript==0.3.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cache the transformers model to be used later by ONNX tests. We need to run the transformers # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/ diff --git a/.ci/docker/common/install_openblas.sh b/.ci/docker/common/install_openblas.sh index 2f386c6bd523a..9baed367ba955 100755 --- a/.ci/docker/common/install_openblas.sh +++ b/.ci/docker/common/install_openblas.sh @@ -3,10 +3,15 @@ set -ex +<<<<<<< HEAD OPENBLAS_VERSION=${OPENBLAS_VERSION:-"v0.3.30"} # Clone OpenBLAS git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION}" --depth 1 --shallow-submodules +======= +cd / +git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OPENBLAS_CHECKOUT_DIR="OpenBLAS" OPENBLAS_BUILD_FLAGS=" @@ -19,7 +24,12 @@ CFLAGS=-O3 BUILD_BFLOAT16=1 " +<<<<<<< HEAD make -j8 ${OPENBLAS_BUILD_FLAGS} -C $OPENBLAS_CHECKOUT_DIR sudo make install -C $OPENBLAS_CHECKOUT_DIR -rm -rf $OPENBLAS_CHECKOUT_DIR \ No newline at end of file +rm -rf $OPENBLAS_CHECKOUT_DIR +======= +make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR} +make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh index afba246cbf0c7..eff17bd922f23 100644 --- a/.ci/docker/common/install_rocm.sh +++ b/.ci/docker/common/install_rocm.sh @@ -2,11 +2,14 @@ set -ex +<<<<<<< HEAD # for pip_install function source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" ROCM_COMPOSABLE_KERNEL_VERSION="$(cat $(dirname $0)/../ci_commit_pins/rocm-composable-kernel.txt)" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ver() { printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' 
' '); } @@ -35,6 +38,7 @@ EOF # we want the patch version of 6.4 instead if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then +<<<<<<< HEAD ROCM_VERSION="${ROCM_VERSION}.2" fi @@ -48,6 +52,18 @@ EOF # Add rocm repository wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - +======= + ROCM_VERSION="${ROCM_VERSION}.1" + fi + + # Add amdgpu repository + UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'` + echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list + + # Add rocm repository + wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - + local rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "deb [arch=amd64] ${rocm_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/rocm.list apt-get update --allow-insecure-repositories @@ -60,9 +76,25 @@ EOF roctracer-dev \ amd-smi-lib +<<<<<<< HEAD # precompiled miopen kernels is too old and never updated from last 3+yrs so removing the logic to install # Also, these kernels are not generating for MI300X, MI350 and also not reliable anymore +======= + if [[ $(ver $ROCM_VERSION) -ge $(ver 6.1) ]]; then + DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev + fi + + # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5 + # search for all unversioned packages + # if search fails it will abort this script; use true to avoid case where search fails + MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true) + if [[ "x${MIOPENHIPGFX}" = x ]]; then + echo "miopen-hip-gfx package not available" && exit 1 + else + DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX} + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime for kdb in /opt/rocm/share/miopen/db/*.kdb @@ -71,6 +103,7 @@ EOF done # ROCm 6.3 had a regression where initializing static code objects had significant overhead +<<<<<<< HEAD # CI no longer builds for ROCm 6.3, but # ROCm 6.4 did not yet fix the regression, also HIP branch names are different if [[ $(ver $ROCM_VERSION) -ge $(ver 6.4) ]] && [[ $(ver $ROCM_VERSION) -lt $(ver 7.0) ]]; then @@ -98,12 +131,40 @@ EOF cmake .. 
-DPython3_EXECUTABLE=/opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}/bin/python3 -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR make -j cp hipamd/lib/libamdhip64.so.6.4.* /opt/rocm/lib/libamdhip64.so.6.4.* +======= + # ROCm 6.4 did not yet fix the regression, also HIP branch names are different + if [[ $(ver $ROCM_VERSION) -ge $(ver 6.3) ]] && [[ $(ver $ROCM_VERSION) -lt $(ver 7.0) ]]; then + if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then + HIP_BRANCH=release/rocm-rel-6.4 + VER_STR=6.4 + VER_PATCH=.1 + elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then + HIP_BRANCH=release/rocm-rel-6.4 + VER_STR=6.4 + elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then + HIP_BRANCH=rocm-6.3.x + VER_STR=6.3 + fi + # clr build needs CppHeaderParser but can only find it using conda's python + /opt/conda/bin/python -m pip install CppHeaderParser + git clone https://github.com/ROCm/HIP -b $HIP_BRANCH + HIP_COMMON_DIR=$(readlink -f HIP) + git clone https://github.com/jeffdaily/clr -b release/rocm-rel-${VER_STR}${VER_PATCH}-statco-hotfix + mkdir -p clr/build + pushd clr/build + cmake .. -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR + make -j + cp hipamd/lib/libamdhip64.so.${VER_STR}.* /opt/rocm/lib/libamdhip64.so.${VER_STR}.* +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd rm -rf HIP clr fi +<<<<<<< HEAD pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cleanup apt-get autoclean && apt-get clean rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* @@ -114,6 +175,7 @@ install_centos() { yum update -y yum install -y kmod yum install -y wget +<<<<<<< HEAD if [[ $OS_VERSION == 9 ]]; then dnf install -y openblas-serial @@ -150,6 +212,28 @@ install_centos() { else local rocm_baseurl="http://repo.radeon.com/rocm/yum/${ROCM_VERSION}/main" fi +======= + yum install -y openblas-devel + + yum install -y epel-release + yum install -y dkms kernel-headers-`uname -r` kernel-devel-`uname -r` + + # Add amdgpu repository + local amdgpu_baseurl + if [[ $OS_VERSION == 9 ]]; then + amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/9.0/main/x86_64" + else + amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/7.9/main/x86_64" + fi + echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo + echo "name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo + echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo + echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo + echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo + echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo + + local rocm_baseurl="http://repo.radeon.com/rocm/yum/${ROCM_VERSION}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "[ROCm]" > /etc/yum.repos.d/rocm.repo echo "name=ROCm" >> /etc/yum.repos.d/rocm.repo echo "baseurl=${rocm_baseurl}" >> /etc/yum.repos.d/rocm.repo @@ -157,6 +241,7 @@ install_centos() { echo "gpgcheck=1" >> /etc/yum.repos.d/rocm.repo echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/rocm.repo +<<<<<<< HEAD if [[ $OS_VERSION == 9 ]]; then yum update -y --nogpgcheck dnf --enablerepo=crb install -y perl-File-BaseDir python3-wheel @@ -164,6 +249,11 @@ install_centos() { else yum update -y yum install -y \ +======= + yum update -y + 
+ yum install -y \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) rocm-dev \ rocm-utils \ rocm-libs \ @@ -171,9 +261,21 @@ install_centos() { rocprofiler-dev \ roctracer-dev \ amd-smi-lib +<<<<<<< HEAD fi # precompiled miopen kernels is too old and never updated from last 3+yrs so removing the logic to install # Also, these kernels are not generating for MI300X, MI350 and also not reliable anymore +======= + + # precompiled miopen kernels; search for all unversioned packages + # if search fails it will abort this script; use true to avoid case where search fails + MIOPENHIPGFX=$(yum -q search miopen-hip-gfx | grep miopen-hip-gfx | awk '{print $1}'| grep -F kdb. || true) + if [[ "x${MIOPENHIPGFX}" = x ]]; then + echo "miopen-hip-gfx package not available" && exit 1 + else + yum install -y ${MIOPENHIPGFX} + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime for kdb in /opt/rocm/share/miopen/db/*.kdb @@ -181,8 +283,11 @@ install_centos() { sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;" done +<<<<<<< HEAD pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cleanup yum clean all rm -rf /var/cache/yum @@ -190,8 +295,11 @@ install_centos() { rm -rf /var/lib/yum/history } +<<<<<<< HEAD OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install Python packages depending on the base OS ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') case "$ID" in diff --git a/.ci/docker/common/install_rocm_magma.sh b/.ci/docker/common/install_rocm_magma.sh index 9bf45e6f1b0a9..9ba07c8e26331 100644 --- a/.ci/docker/common/install_rocm_magma.sh +++ b/.ci/docker/common/install_rocm_magma.sh @@ -12,8 +12,13 @@ function do_install() { rocm_version_nodot=${rocm_version//./} +<<<<<<< HEAD # https://github.com/icl-utk-edu/magma/pull/65 MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec +======= + # Version 2.7.2 + ROCm related updates + MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" rocm_dir="/opt/rocm" diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh index b2fdebdcc4747..1a670aaaa9f73 100755 --- a/.ci/docker/common/install_triton.sh +++ b/.ci/docker/common/install_triton.sh @@ -57,7 +57,11 @@ if [ ! 
-f setup.py ]; then cd python fi +<<<<<<< HEAD pip_install pybind11==3.0.1 +======= +pip_install pybind11==2.13.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527 as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py @@ -66,15 +70,25 @@ if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" # Triton needs at least gcc-9 to build apt-get install -y g++-9 +<<<<<<< HEAD CXX=g++-9 conda_run python -m build --wheel --no-isolation +======= + CXX=g++-9 conda_run python setup.py bdist_wheel +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then # Triton needs which surprisingly is not available with clang-9 toolchain add-apt-repository -y ppa:ubuntu-toolchain-r/test apt-get install -y g++-9 +<<<<<<< HEAD CXX=g++-9 conda_run python -m build --wheel --no-isolation else conda_run python -m build --wheel --no-isolation +======= + CXX=g++-9 conda_run python setup.py bdist_wheel +else + conda_run python setup.py bdist_wheel +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi # Copy the wheel to /opt for multi stage docker builds @@ -98,10 +112,15 @@ fi if [ -n "${NUMPY_VERSION}" ]; then pip_install "numpy==${NUMPY_VERSION}" fi +<<<<<<< HEAD # IMPORTANT: helion needs to be installed without dependencies. # It depends on torch and triton. We don't want to install # triton and torch from production on Docker CI images if [[ "$ANACONDA_PYTHON_VERSION" != 3.9* ]]; then pip_install helion --no-deps +======= +if [[ "$ANACONDA_PYTHON_VERSION" != 3.9* ]]; then + pip_install helion +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi diff --git a/.ci/docker/common/install_ucc.sh b/.ci/docker/common/install_ucc.sh index 04f15a52e88e3..10048ebc19efc 100755 --- a/.ci/docker/common/install_ucc.sh +++ b/.ci/docker/common/install_ucc.sh @@ -44,12 +44,17 @@ function install_ucc() { ./autogen.sh +<<<<<<< HEAD if [[ -n "$CUDA_VERSION" && $CUDA_VERSION == 13* ]]; then NVCC_GENCODE="-gencode=arch=compute_86,code=compute_86" else # We only run distributed tests on Tesla M60 and A10G NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86" fi +======= + # We only run distributed tests on Tesla M60 and A10G + NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ -n "$ROCM_VERSION" ]]; then if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then diff --git a/.ci/docker/common/install_vision.sh b/.ci/docker/common/install_vision.sh index 532d8d14a55c8..665b8c0805c65 100755 --- a/.ci/docker/common/install_vision.sh +++ b/.ci/docker/common/install_vision.sh @@ -15,6 +15,7 @@ install_ubuntu() { install_centos() { # Need EPEL for many packages we depend on. 
# See http://fedoraproject.org/wiki/EPEL +<<<<<<< HEAD if [[ $OS_VERSION == 9 ]]; then yum install -y epel-release else @@ -23,6 +24,12 @@ install_centos() { opencv-devel \ ffmpeg-devel fi +======= + yum --enablerepo=extras install -y epel-release + + yum install -y \ + opencv-devel +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cleanup yum clean all @@ -31,8 +38,11 @@ install_centos() { rm -rf /var/lib/yum/history } +<<<<<<< HEAD OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install base packages depending on the base OS ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') case "$ID" in diff --git a/.ci/docker/common/install_xpu.sh b/.ci/docker/common/install_xpu.sh index 0b150872f93ce..f77c9bb6d2f95 100644 --- a/.ci/docker/common/install_xpu.sh +++ b/.ci/docker/common/install_xpu.sh @@ -34,6 +34,7 @@ function install_ubuntu() { # The xpu-smi packages apt-get install -y flex bison xpu-smi +<<<<<<< HEAD if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then # Compute and Media Runtimes @@ -55,6 +56,20 @@ function install_ubuntu() { apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev fi +======= + # Compute and Media Runtimes + apt-get install -y \ + intel-opencl-icd intel-level-zero-gpu level-zero \ + intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \ + libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ + libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \ + mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo + if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then + apt-get install -y intel-ocloc + fi + # Development Packages + apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install Intel Support Packages apt-get install -y ${XPU_PACKAGES} @@ -143,6 +158,7 @@ function install_sles() { } +<<<<<<< HEAD # Default use GPU driver rolling releases XPU_DRIVER_VERSION="" if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then @@ -155,6 +171,20 @@ if [[ "$XPU_VERSION" == "2025.2" ]]; then XPU_PACKAGES="intel-deep-learning-essentials-2025.2" else XPU_PACKAGES="intel-deep-learning-essentials-2025.1" +======= +# Default use GPU driver LTS releases +XPU_DRIVER_VERSION="/lts/2350" +if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then + # Use GPU driver rolling releases + XPU_DRIVER_VERSION="" +fi + +# Default use Intel® oneAPI Deep Learning Essentials 2025.0 +if [[ "$XPU_VERSION" == "2025.1" ]]; then + XPU_PACKAGES="intel-deep-learning-essentials-2025.1" +else + XPU_PACKAGES="intel-deep-learning-essentials-2025.0" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi # The installation depends on the base OS diff --git a/.ci/docker/libtorch/Dockerfile b/.ci/docker/libtorch/Dockerfile index c93f022268b25..1c7781b192289 100644 --- a/.ci/docker/libtorch/Dockerfile +++ b/.ci/docker/libtorch/Dockerfile @@ -69,6 +69,7 @@ RUN bash ./install_cuda.sh 12.9 RUN bash ./install_magma.sh 12.9 RUN ln -sf /usr/local/cuda-12.9 /usr/local/cuda +<<<<<<< HEAD FROM cuda as cuda13.0 RUN bash ./install_cuda.sh 13.0 RUN bash 
./install_magma.sh 13.0 @@ -83,6 +84,10 @@ RUN apt-get update -y && \ cp /usr/lib/x86_64-linux-gnu/libnl* /usr/local/cuda/lib64/ FROM cpu as rocm +======= +FROM cpu as rocm +ARG ROCM_VERSION +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARG PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} ENV MKLROOT /opt/intel @@ -101,7 +106,11 @@ RUN apt-get update -y && \ apt-get clean RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh +<<<<<<< HEAD RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh +======= +RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FROM ${BASE_TARGET} as final COPY --from=openssl /opt/openssl /opt/openssl diff --git a/.ci/docker/libtorch/build.sh b/.ci/docker/libtorch/build.sh index c40896cb5499f..756be70a0a24c 100755 --- a/.ci/docker/libtorch/build.sh +++ b/.ci/docker/libtorch/build.sh @@ -39,6 +39,7 @@ case ${DOCKER_TAG_PREFIX} in DOCKER_GPU_BUILD_ARG="" ;; rocm*) +<<<<<<< HEAD # we want the patch version of 7.0 instead if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2" @@ -54,6 +55,11 @@ case ${DOCKER_TAG_PREFIX} in if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" fi +======= + BASE_TARGET=rocm + GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}" ;; *) diff --git a/.ci/docker/linter/Dockerfile b/.ci/docker/linter/Dockerfile index 95d08ffea051d..658ad4a91709e 100644 --- a/.ci/docker/linter/Dockerfile +++ b/.ci/docker/linter/Dockerfile @@ -27,7 +27,10 @@ COPY ./common/install_linter.sh install_linter.sh RUN bash ./install_linter.sh RUN rm install_linter.sh +<<<<<<< HEAD RUN chown -R jenkins:jenkins /var/lib/jenkins/ci_env +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) USER jenkins CMD ["bash"] diff --git a/.ci/docker/manywheel/Dockerfile_2_28 b/.ci/docker/manywheel/Dockerfile_2_28 index 4803cb778c905..ebbce2f360f93 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28 +++ b/.ci/docker/manywheel/Dockerfile_2_28 @@ -130,8 +130,12 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/op RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \ /opt/python/${cpython_version}/bin/python -m pip install setuptools wheel; \ done; +<<<<<<< HEAD ADD ./common/patch_libstdc.sh patch_libstdc.sh RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # cmake-3.18.4 from pip; force in case cmake3 already exists RUN yum install -y python3-pip && \ @@ -176,6 +180,10 @@ ENV XPU_DRIVER_TYPE ROLLING RUN python3 -m pip install --upgrade pip && \ python3 -mpip install cmake==3.28.4 ADD ./common/install_xpu.sh install_xpu.sh +<<<<<<< HEAD ENV XPU_VERSION 2025.2 +======= +ENV XPU_VERSION 2025.1 +>>>>>>> 5729657180 ([ROCm] Specialized 
binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN bash ./install_xpu.sh && rm install_xpu.sh RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd diff --git a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 index 768db09929361..9a45488354c4a 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 +++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 @@ -62,6 +62,7 @@ ARG OPENBLAS_VERSION ADD ./common/install_openblas.sh install_openblas.sh RUN bash ./install_openblas.sh && rm install_openblas.sh +<<<<<<< HEAD # Install Arm Compute Library FROM base as arm_compute # use python3.9 to install scons @@ -69,6 +70,8 @@ RUN python3.9 -m pip install scons==4.7.0 RUN ln -sf /opt/python/cp39-cp39/bin/scons /usr/local/bin COPY ./common/install_acl.sh install_acl.sh RUN bash ./install_acl.sh && rm install_acl.sh +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FROM base as final # remove unnecessary python versions @@ -77,7 +80,11 @@ RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 COPY --from=openblas /opt/OpenBLAS/ /opt/OpenBLAS/ +<<<<<<< HEAD COPY --from=arm_compute /acl /acl ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:/acl/build/:$LD_LIBRARY_PATH ADD ./common/patch_libstdc.sh patch_libstdc.sh RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh +======= +ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/manywheel/Dockerfile_cuda_aarch64 b/.ci/docker/manywheel/Dockerfile_cuda_aarch64 index 347a01ee4ede7..35aba8e282941 100644 --- a/.ci/docker/manywheel/Dockerfile_cuda_aarch64 +++ b/.ci/docker/manywheel/Dockerfile_cuda_aarch64 @@ -86,6 +86,7 @@ FROM base as nvpl ADD ./common/install_nvpl.sh install_nvpl.sh RUN bash ./install_nvpl.sh && rm install_nvpl.sh +<<<<<<< HEAD # Install Arm Compute Library FROM base as arm_compute # use python3.9 to install scons @@ -95,6 +96,8 @@ COPY ./common/install_acl.sh install_acl.sh RUN bash ./install_acl.sh && rm install_acl.sh FROM base as final +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FROM final as cuda_final ARG BASE_CUDA_VERSION RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION} @@ -102,9 +105,14 @@ COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BAS COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION} COPY --from=nvpl /opt/nvpl/lib/ /usr/local/lib/ COPY --from=nvpl /opt/nvpl/include/ /usr/local/include/ +<<<<<<< HEAD COPY --from=arm_compute /acl /acl RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda ENV PATH=/usr/local/cuda/bin:$PATH ENV LD_LIBRARY_PATH=/acl/build/:$LD_LIBRARY_PATH ADD ./common/patch_libstdc.sh patch_libstdc.sh RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh +======= +RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda +ENV PATH=/usr/local/cuda/bin:$PATH +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/manywheel/Dockerfile_cxx11-abi b/.ci/docker/manywheel/Dockerfile_cxx11-abi new 
file mode 100644 index 0000000000000..ed33cc61df093 --- /dev/null +++ b/.ci/docker/manywheel/Dockerfile_cxx11-abi @@ -0,0 +1,71 @@ +FROM centos:8 as base + +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 +ENV PATH /opt/rh/gcc-toolset-11/root/bin/:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + +# change to a valid repo +RUN sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-Linux-*.repo +# enable to install ninja-build +RUN sed -i 's|enabled=0|enabled=1|g' /etc/yum.repos.d/CentOS-Linux-PowerTools.repo + +RUN yum -y update +RUN yum install -y wget curl perl util-linux xz bzip2 git patch which zlib-devel sudo +RUN yum install -y autoconf automake make cmake gdb gcc-toolset-11-gcc-c++ + + +FROM base as openssl +ADD ./common/install_openssl.sh install_openssl.sh +RUN bash ./install_openssl.sh && rm install_openssl.sh + +# Install python +FROM base as python +RUN yum install -y openssl-devel zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel libpcap-devel xz-devel libffi-devel +ADD common/install_cpython.sh install_cpython.sh +RUN bash ./install_cpython.sh && rm install_cpython.sh + +FROM base as conda +ADD ./common/install_conda_docker.sh install_conda.sh +RUN bash ./install_conda.sh && rm install_conda.sh +RUN /opt/conda/bin/conda install -y cmake + +FROM base as intel +# Install MKL +COPY --from=python /opt/python /opt/python +COPY --from=python /opt/_internal /opt/_internal +COPY --from=conda /opt/conda /opt/conda +ENV PATH=/opt/conda/bin:$PATH +ADD ./common/install_mkl.sh install_mkl.sh +RUN bash ./install_mkl.sh && rm install_mkl.sh + +FROM base as patchelf +ADD ./common/install_patchelf.sh install_patchelf.sh +RUN bash ./install_patchelf.sh && rm install_patchelf.sh +RUN cp $(which patchelf) /patchelf + +FROM base as jni +ADD ./common/install_jni.sh install_jni.sh +ADD ./java/jni.h jni.h +RUN bash ./install_jni.sh && rm install_jni.sh + +FROM base as libpng +ADD ./common/install_libpng.sh install_libpng.sh +RUN bash ./install_libpng.sh && rm install_libpng.sh + +FROM base as final +COPY --from=openssl /opt/openssl /opt/openssl +COPY --from=python /opt/python /opt/python +COPY --from=python /opt/_internal /opt/_internal +COPY --from=intel /opt/intel /opt/intel +COPY --from=conda /opt/conda /opt/conda +COPY --from=patchelf /usr/local/bin/patchelf /usr/local/bin/patchelf +COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h +COPY --from=libpng /usr/local/bin/png* /usr/local/bin/ +COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/ +COPY --from=libpng /usr/local/include/png* /usr/local/include/ +COPY --from=libpng /usr/local/include/libpng* /usr/local/include/ +COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/ +COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig + +RUN yum install -y ninja-build diff --git a/.ci/docker/manywheel/Dockerfile_s390x b/.ci/docker/manywheel/Dockerfile_s390x index 1cf83acb1c736..0a85278e8b0eb 100644 --- a/.ci/docker/manywheel/Dockerfile_s390x +++ b/.ci/docker/manywheel/Dockerfile_s390x @@ -115,9 +115,12 @@ RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio # cmake-3.28.0 from pip for onnxruntime RUN python3 -mpip install cmake==3.28.0 +<<<<<<< HEAD ADD ./common/patch_libstdc.sh patch_libstdc.sh RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # 
build onnxruntime 1.21.0 from sources. # it is not possible to build it from sources using pip, # so just build it from upstream repository. @@ -134,8 +137,11 @@ RUN pip3 install flatbuffers && \ git clone https://github.com/microsoft/onnxruntime && \ cd onnxruntime && git checkout v1.21.0 && \ git submodule update --init --recursive && \ +<<<<<<< HEAD wget https://github.com/microsoft/onnxruntime/commit/f57db79743c4d1a3553aa05cf95bcd10966030e6.patch && \ patch -p1 < f57db79743c4d1a3553aa05cf95bcd10966030e6.patch && \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ./build.sh --config Release --parallel 0 --enable_pybind \ --build_wheel --enable_training --enable_training_apis \ --enable_training_ops --skip_tests --allow_running_as_root \ diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh index b4b5059973037..a4fc37c81e1a8 100755 --- a/.ci/docker/manywheel/build.sh +++ b/.ci/docker/manywheel/build.sh @@ -28,7 +28,10 @@ fi MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-} DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-} OPENBLAS_VERSION=${OPENBLAS_VERSION:-} +<<<<<<< HEAD ACL_VERSION=${ACL_VERSION:-} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case ${image} in manylinux2_28-builder:cpu) @@ -42,6 +45,16 @@ case ${image} in GPU_IMAGE=arm64v8/almalinux:8 DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1" MANY_LINUX_VERSION="2_28_aarch64" +<<<<<<< HEAD +======= + OPENBLAS_VERSION="v0.3.30" + ;; + manylinuxcxx11-abi-builder:cpu-cxx11-abi) + TARGET=final + GPU_IMAGE="" + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9" + MANY_LINUX_VERSION="cxx11-abi" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ;; manylinuxs390x-builder:cpu-s390x) TARGET=final @@ -61,12 +74,15 @@ case ${image} in DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13" MANY_LINUX_VERSION="2_28" ;; +<<<<<<< HEAD manylinux2_28-builder:cuda13*) TARGET=cuda_final GPU_IMAGE=amd64/almalinux:8 DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13" MANY_LINUX_VERSION="2_28" ;; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manylinuxaarch64-builder:cuda*) TARGET=cuda_final GPU_IMAGE=amd64/almalinux:8 @@ -75,6 +91,7 @@ case ${image} in DOCKERFILE_SUFFIX="_cuda_aarch64" ;; manylinux2_28-builder:rocm*) +<<<<<<< HEAD # we want the patch version of 7.0 instead if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2" @@ -83,15 +100,20 @@ case ${image} in if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4" fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TARGET=rocm_final MANY_LINUX_VERSION="2_28" DEVTOOLSET_VERSION="11" GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" +<<<<<<< HEAD # add gfx950, gfx115x conditionally starting in ROCm 7.0 if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" fi +======= +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" ;; manylinux2_28-builder:xpu) @@ -123,8 +145,12 @@ tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]') DOCKER_BUILDKIT=1 docker build \ ${DOCKER_GPU_BUILD_ARG} \ --build-arg "GPU_IMAGE=${GPU_IMAGE}" \ +<<<<<<< HEAD --build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION:-}" \ --build-arg "ACL_VERSION=${ACL_VERSION:-}" \ +======= + --build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION}" \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --target "${TARGET}" \ -t "${tmp_tag}" \ $@ \ diff --git a/.ci/docker/manywheel/build_scripts/ssl-check.py b/.ci/docker/manywheel/build_scripts/ssl-check.py index c4df0eacbb7fd..7f6f92f12c84b 100644 --- a/.ci/docker/manywheel/build_scripts/ssl-check.py +++ b/.ci/docker/manywheel/build_scripts/ssl-check.py @@ -10,6 +10,14 @@ print("Testing SSL certificate checking for Python:", sys.version) +<<<<<<< HEAD +======= +if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4): + print("This version never checks SSL certs; skipping tests") + sys.exit(0) + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) EXC = OSError print(f"Connecting to {GOOD_SSL} should work") diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 658d2d34a6474..677883fdcfd3d 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -10,11 +10,14 @@ boto3==1.35.42 #Pinned versions: 1.19.12, 1.16.34 #test that import: +<<<<<<< HEAD build==1.3.0 #Description: A simple, correct Python build frontend. 
#Pinned versions: 1.3.0 #test that import: +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) click #Description: Command Line Interface Creation Kit #Pinned versions: @@ -52,10 +55,17 @@ flatbuffers==24.12.23 #Pinned versions: 24.12.23 #test that import: +<<<<<<< HEAD hypothesis==6.56.4 # Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 #Description: advanced library for generating parametrized tests #Pinned versions: 6.56.4 +======= +hypothesis==5.35.1 +# Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 +#Description: advanced library for generating parametrized tests +#Pinned versions: 3.44.6, 4.53.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #test that import: test_xnnpack_integration.py, test_pruning_op.py, test_nn.py junitparser==2.1.1 @@ -68,12 +78,20 @@ lark==0.12.0 #Pinned versions: 0.12.0 #test that import: +<<<<<<< HEAD librosa>=0.6.2 ; python_version < "3.11" and platform_machine != "s390x" librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x" #Description: A python package for music and audio analysis #Pinned versions: >=0.6.2 #test that import: test_spectral_ops.py #librosa depends on numba; disable it for s390x while numba is disabled too +======= +librosa>=0.6.2 ; python_version < "3.11" +librosa==0.10.2 ; python_version == "3.12" +#Description: A python package for music and audio analysis +#Pinned versions: >=0.6.2 +#test that import: test_spectral_ops.py +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #mkl #this breaks linux-bionic-rocm4.5-py3.7 #Description: Intel oneAPI Math Kernel Library @@ -98,9 +116,14 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x" #Pinned versions: #test that import: +<<<<<<< HEAD mypy==1.16.0 ; platform_system == "Linux" # Pin MyPy version because new errors are likely to appear with each release # Skip on Windows as lots of type annotations are POSIX specific +======= +mypy==1.16.0 +# Pin MyPy version because new errors are likely to appear with each release +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #Description: linter #Pinned versions: 1.16.0 #test that import: test_typing.py, test_type_hints.py @@ -111,18 +134,31 @@ networkx==2.8.8 #Pinned versions: 2.8.8 #test that import: functorch +<<<<<<< HEAD ninja==1.11.1.4 #Description: build system. Used in some tests. Used in build to generate build #time tracing information #Pinned versions: 1.11.1.4 +======= +ninja==1.11.1.3 +#Description: build system. Used in some tests. 
Used in build to generate build +#time tracing information +#Pinned versions: 1.11.1.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py numba==0.60.0 ; python_version == "3.9" numba==0.61.2 ; python_version > "3.9" #Description: Just-In-Time Compiler for Numerical Functions +<<<<<<< HEAD #Pinned versions: 0.55.2, 0.60.0 #test that import: test_numba_integration.py #Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073 +======= +#Pinned versions: 0.54.1, 0.49.0, <=0.49.1 +#test that import: test_numba_integration.py +#For numba issue see https://github.com/pytorch/pytorch/issues/51511 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #numpy #Description: Provides N-dimensional arrays and linear algebra @@ -166,6 +202,7 @@ optree==0.13.0 pillow==11.0.0 #Description: Python Imaging Library fork +<<<<<<< HEAD #Pinned versions: 11.0.0 #test that import: @@ -174,6 +211,16 @@ protobuf==5.29.5 #Pinned versions: 5.29.5 #test that import: test_tensorboard.py, test/onnx/* +======= +#Pinned versions: 10.3.0 +#test that import: + +protobuf==3.20.2 ; python_version <= "3.12" +protobuf==4.25.1 ; python_version == "3.13" +#Description: Google’s data interchange format +#Pinned versions: 3.20.1 +#test that import: test_tensorboard.py +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) psutil #Description: information on running processes and system utilization @@ -215,7 +262,11 @@ pytest-subtests==0.13.1 #Pinned versions: #test that import: +<<<<<<< HEAD xdoctest==1.3.0 +======= +xdoctest==1.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #Description: runs doctests in pytest #Pinned versions: 1.1.0 #test that import: @@ -225,9 +276,15 @@ pygments==2.15.0 #Pinned versions: 2.12.0 #test that import: the doctests +<<<<<<< HEAD #pyyaml #Description: data serialization format #Pinned versions: 6.0.2 +======= +#PyYAML +#Description: data serialization format +#Pinned versions: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #test that import: #requests @@ -237,12 +294,22 @@ pygments==2.15.0 #rich #Description: rich text and beautiful formatting in the terminal +<<<<<<< HEAD #Pinned versions: 14.1.0 #test that import: scikit-image==0.22.0 #Description: image processing routines #Pinned versions: 0.22.0 +======= +#Pinned versions: 10.9.0 +#test that import: + +scikit-image==0.19.3 ; python_version < "3.10" +scikit-image==0.22.0 ; python_version >= "3.10" +#Description: image processing routines +#Pinned versions: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #test that import: test_nn.py #scikit-learn @@ -265,7 +332,11 @@ scipy==1.14.1 ; python_version > "3.9" #test that import: # needed by torchgen utils +<<<<<<< HEAD typing-extensions==4.12.2 +======= +typing-extensions>=4.10.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #Description: type hints for python #Pinned versions: #test that import: @@ -305,7 +376,11 @@ pytest-cpp==2.3.0 #Pinned versions: 2.3.0 #test 
that import: +<<<<<<< HEAD z3-solver==4.15.1.0 ; platform_machine != "s390x" +======= +z3-solver==4.12.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #Description: The Z3 Theorem Prover Project #Pinned versions: #test that import: @@ -326,6 +401,11 @@ lxml==5.3.0 ; python_version <= "3.12" lxml==6.0.0 ; python_version == "3.13" #Description: This is a requirement of unittest-xml-reporting +<<<<<<< HEAD +======= +# Python-3.9 binaries + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) PyGithub==2.3.0 sympy==1.13.3 @@ -333,6 +413,7 @@ sympy==1.13.3 #Pinned versions: #test that import: +<<<<<<< HEAD onnx==1.19.1 #Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal @@ -340,6 +421,15 @@ onnx==1.19.1 #test that import: onnxscript==0.5.4 +======= +onnx==1.16.1 ; python_version <= "3.12" +onnx==1.18.0 ; python_version == "3.13" +#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal +#Pinned versions: +#test that import: + +onnxscript==0.3.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal #Pinned versions: #test that import: @@ -358,12 +448,21 @@ pwlf==2.2.1 #Pinned versions: 2.2.1 #test that import: test_sac_estimator.py +<<<<<<< HEAD # To build PyTorch itself pyyaml==6.0.2 pyzstd setuptools==78.1.1 packaging==23.1 six +======= + +# To build PyTorch itself +astunparse +PyYAML +pyzstd +setuptools +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scons==4.5.2 ; platform_machine == "aarch64" @@ -377,6 +476,7 @@ dataclasses_json==0.6.7 #Pinned versions: 0.6.7 #test that import: +<<<<<<< HEAD cmake==3.31.6 #Description: required for building @@ -395,3 +495,14 @@ scikit-build==0.18.1 pyre-extensions==0.0.32 tabulate==0.9.0 #Description: These package are needed to build FBGEMM and torchrec on PyTorch CI +======= +cmake==4.0.0 +#Description: required for building + +tlparse==0.3.30 +#Description: required for log parsing + +cuda-bindings>=12.0,<13.0 +#Description: required for testing CUDAGraph::raw_cuda_graph(). See https://nvidia.github.io/cuda-python/cuda-bindings/latest/support.html for how this version was chosen. Note "Any fix in the latest bindings would be backported to the prior major version" means that only the newest version of cuda-bindings will get fixes. Depending on the latest version of 12.x is okay because all 12.y versions will be supported via "CUDA minor version compatibility". Pytorch builds against 13.z versions of cuda toolkit work with 12.x versions of cuda-bindings as well because newer drivers work with old toolkits. 
+#test that import: test_cuda.py +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt index 6e623b4c56949..d98390bf0cd7b 100644 --- a/.ci/docker/requirements-docs.txt +++ b/.ci/docker/requirements-docs.txt @@ -1,6 +1,7 @@ sphinx==5.3.0 #Description: This is used to generate PyTorch docs #Pinned versions: 5.3.0 +<<<<<<< HEAD standard-imghdr==3.13.0; python_version >= "3.13" #Description: This is needed by Sphinx, so it needs to be added here. @@ -10,6 +11,10 @@ standard-imghdr==3.13.0; python_version >= "3.13" # Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency. -e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2 +======= +-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2 + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering # but it doesn't seem to work and hangs around idly. The initial thought that it is probably # something related to Docker setup. We can investigate this later. @@ -26,10 +31,16 @@ sphinx_sitemap==2.6.0 #Description: This is used to generate sitemap for PyTorch docs #Pinned versions: 2.6.0 +<<<<<<< HEAD matplotlib==3.5.3 ; python_version < "3.13" matplotlib==3.6.3 ; python_version >= "3.13" #Description: This is used to generate PyTorch docs #Pinned versions: 3.6.3 if python > 3.12. Otherwise 3.5.3. +======= +matplotlib==3.5.3 +#Description: This is used to generate PyTorch docs +#Pinned versions: 3.5.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tensorboard==2.13.0 ; python_version < "3.13" tensorboard==2.18.0 ; python_version >= "3.13" @@ -57,8 +68,13 @@ IPython==8.12.0 #Pinned versions: 8.12.0 myst-nb==0.17.2 +<<<<<<< HEAD #Description: This is used to generate PyTorch functorch and torch.compile docs. 
#Pinned versions: 0.17.2 +======= +#Description: This is used to generate PyTorch functorch docs +#Pinned versions: 0.13.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs python-etcd==0.4.5 diff --git a/.ci/docker/triton_version.txt b/.ci/docker/triton_version.txt index 1545d966571dc..561eb4a3cc51e 100644 --- a/.ci/docker/triton_version.txt +++ b/.ci/docker/triton_version.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 3.5.0 +======= +3.4.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/triton_xpu_version.txt b/.ci/docker/triton_xpu_version.txt index 1545d966571dc..561eb4a3cc51e 100644 --- a/.ci/docker/triton_xpu_version.txt +++ b/.ci/docker/triton_xpu_version.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 3.5.0 +======= +3.4.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile index b517a990a057b..52ace3ff45137 100644 --- a/.ci/docker/ubuntu-rocm/Dockerfile +++ b/.ci/docker/ubuntu-rocm/Dockerfile @@ -52,6 +52,7 @@ ENV INSTALLED_VISION ${VISION} # Install rocm ARG ROCM_VERSION +<<<<<<< HEAD RUN mkdir ci_commit_pins COPY ./common/common_utils.sh common_utils.sh COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt @@ -59,6 +60,11 @@ COPY ./common/install_rocm.sh install_rocm.sh RUN bash ./install_rocm.sh RUN rm install_rocm.sh common_utils.sh RUN rm -r ci_commit_pins +======= +COPY ./common/install_rocm.sh install_rocm.sh +RUN bash ./install_rocm.sh +RUN rm install_rocm.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) COPY ./common/install_rocm_magma.sh install_rocm_magma.sh RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} RUN rm install_rocm_magma.sh @@ -100,11 +106,18 @@ ARG ANACONDA_PYTHON_VERSION ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh COPY ./common/common_utils.sh common_utils.sh +<<<<<<< HEAD COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt COPY ci_commit_pins/timm.txt timm.txt COPY ci_commit_pins/torchbench.txt torchbench.txt RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt +======= +COPY ci_commit_pins/huggingface.txt huggingface.txt +COPY ci_commit_pins/timm.txt timm.txt +RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi +RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install non-default Ninja version ARG NINJA_VERSION diff --git a/.ci/docker/ubuntu-xpu/Dockerfile b/.ci/docker/ubuntu-xpu/Dockerfile index 8765249688ce5..8ab05c37b9ec5 100644 --- a/.ci/docker/ubuntu-xpu/Dockerfile +++ b/.ci/docker/ubuntu-xpu/Dockerfile @@ -56,10 +56,17 @@ RUN rm install_openssl.sh ARG INDUCTOR_BENCHMARKS COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh COPY 
./common/common_utils.sh common_utils.sh +<<<<<<< HEAD COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt COPY ci_commit_pins/timm.txt timm.txt RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt +======= +COPY ci_commit_pins/huggingface.txt huggingface.txt +COPY ci_commit_pins/timm.txt timm.txt +RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi +RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install XPU Dependencies ARG XPU_VERSION diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 84a74114c381e..150a585bac0c1 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -66,7 +66,10 @@ ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/" # (optional) Install UCC ARG UCX_COMMIT ARG UCC_COMMIT +<<<<<<< HEAD ARG CUDA_VERSION +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV UCX_COMMIT $UCX_COMMIT ENV UCC_COMMIT $UCC_COMMIT ENV UCX_HOME /usr @@ -97,6 +100,7 @@ RUN rm install_openssl.sh ARG INDUCTOR_BENCHMARKS COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh COPY ./common/common_utils.sh common_utils.sh +<<<<<<< HEAD COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt COPY ci_commit_pins/timm.txt timm.txt COPY ci_commit_pins/torchbench.txt torchbench.txt @@ -109,6 +113,12 @@ ARG INSTALL_MINGW COPY ./common/install_mingw.sh install_mingw.sh RUN if [ -n "${INSTALL_MINGW}" ]; then bash ./install_mingw.sh; fi RUN rm install_mingw.sh +======= +COPY ci_commit_pins/huggingface.txt huggingface.txt +COPY ci_commit_pins/timm.txt timm.txt +RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi +RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARG TRITON ARG TRITON_CPU @@ -189,6 +199,10 @@ COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm RUN if [ -n "${SKIP_LLVM_SRC_BUILD_INSTALL}" ]; then set -eu; rm -rf /opt/llvm; fi # AWS specific CUDA build guidance +<<<<<<< HEAD +======= +ENV TORCH_CUDA_ARCH_LIST Maxwell +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV TORCH_NVCC_FLAGS "-Xfatbin -compress-all" ENV CUDA_PATH /usr/local/cuda diff --git a/.ci/libtorch/build.sh b/.ci/libtorch/build.sh index c2d67f8b1bb29..7c668ca81e714 100644 --- a/.ci/libtorch/build.sh +++ b/.ci/libtorch/build.sh @@ -7,4 +7,8 @@ set -ex SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +<<<<<<< HEAD USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh +======= +USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/magma-rocm/Makefile b/.ci/magma-rocm/Makefile index 9fca7ad544617..f7d21fcc579bc 100644 --- a/.ci/magma-rocm/Makefile +++ 
b/.ci/magma-rocm/Makefile @@ -1,11 +1,19 @@ SHELL=/usr/bin/env bash DOCKER_CMD ?= docker +<<<<<<< HEAD DESIRED_ROCM ?= 7.0 DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM)) PACKAGE_NAME = magma-rocm # inherit this from underlying docker image, do not pass this env var to docker #PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201 +======= +DESIRED_ROCM ?= 6.4 +DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM)) +PACKAGE_NAME = magma-rocm +# inherit this from underlying docker image, do not pass this env var to docker +#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ -v $(shell git rev-parse --show-toplevel)/.ci:/builder \ @@ -16,20 +24,36 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ magma-rocm/build_magma.sh .PHONY: all +<<<<<<< HEAD all: magma-rocm70 all: magma-rocm64 +======= +all: magma-rocm64 +all: magma-rocm63 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) .PHONY: clean: $(RM) -r magma-* $(RM) -r output +<<<<<<< HEAD .PHONY: magma-rocm70 magma-rocm70: DESIRED_ROCM := 7.0 magma-rocm70: $(DOCKER_RUN) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) .PHONY: magma-rocm64 magma-rocm64: DESIRED_ROCM := 6.4 magma-rocm64: $(DOCKER_RUN) +<<<<<<< HEAD +======= + +.PHONY: magma-rocm63 +magma-rocm63: DESIRED_ROCM := 6.3 +magma-rocm63: + $(DOCKER_RUN) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/magma-rocm/build_magma.sh b/.ci/magma-rocm/build_magma.sh index c7c7780227ea5..0b435f5f337ef 100755 --- a/.ci/magma-rocm/build_magma.sh +++ b/.ci/magma-rocm/build_magma.sh @@ -6,8 +6,13 @@ set -eou pipefail # The script expects DESIRED_CUDA and PACKAGE_NAME to be set ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +<<<<<<< HEAD # https://github.com/icl-utk-edu/magma/pull/65 MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec +======= +# Version 2.7.2 + ROCm related updates +MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Folders for the build PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata @@ -20,7 +25,11 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE # Fetch magma sources and verify checksum pushd ${PACKAGE_DIR} +<<<<<<< HEAD git clone https://github.com/jeffdaily/magma +======= +git clone https://bitbucket.org/icl/magma.git +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pushd magma git checkout ${MAGMA_VERSION} popd diff --git a/.ci/magma/Makefile b/.ci/magma/Makefile index 4169aedd03fa5..233925d95eb67 100644 --- a/.ci/magma/Makefile +++ b/.ci/magma/Makefile @@ -16,7 +16,10 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ magma/build_magma.sh .PHONY: all +<<<<<<< HEAD all: magma-cuda130 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) all: magma-cuda129 all: magma-cuda128 all: magma-cuda126 @@ -26,12 +29,15 @@ clean: $(RM) -r magma-* $(RM) -r output +<<<<<<< HEAD .PHONY: magma-cuda130 magma-cuda130: DESIRED_CUDA := 13.0 magma-cuda130: CUDA_ARCH_LIST := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 magma-cuda130: $(DOCKER_RUN) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) .PHONY: magma-cuda129 magma-cuda129: DESIRED_CUDA := 12.9 magma-cuda129: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 diff --git a/.ci/magma/build_magma.sh b/.ci/magma/build_magma.sh index 6f1924fa45965..c88109ab01765 100755 --- a/.ci/magma/build_magma.sh +++ b/.ci/magma/build_magma.sh @@ -28,7 +28,10 @@ pushd ${PACKAGE_DIR}/magma-${MAGMA_VERSION} patch < ${PACKAGE_FILES}/CMake.patch patch < ${PACKAGE_FILES}/cmakelists.patch patch -p0 < ${PACKAGE_FILES}/thread_queue.patch +<<<<<<< HEAD patch -p1 < ${PACKAGE_FILES}/cuda13.patch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) patch -p1 < ${PACKAGE_FILES}/getrf_shfl.patch patch -p1 < ${PACKAGE_FILES}/getrf_nbparam.patch # The build.sh script expects to be executed from the sources root folder @@ -38,7 +41,10 @@ popd # Package recipe, license and tarball # Folder and package name are backward compatible for the build workflow cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh +<<<<<<< HEAD cp ${PACKAGE_FILES}/cuda13.patch ${PACKAGE_RECIPE}/cuda13.patch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cp ${PACKAGE_FILES}/thread_queue.patch ${PACKAGE_RECIPE}/thread_queue.patch cp ${PACKAGE_FILES}/cmakelists.patch ${PACKAGE_RECIPE}/cmakelists.patch cp ${PACKAGE_FILES}/getrf_shfl.patch ${PACKAGE_RECIPE}/getrf_shfl.patch diff --git a/.ci/manywheel/build.sh b/.ci/manywheel/build.sh index 6b2a60bc5ca28..82339921b69dd 100755 --- a/.ci/manywheel/build.sh +++ 
b/.ci/manywheel/build.sh @@ -5,6 +5,13 @@ set -ex SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" case "${GPU_ARCH_TYPE:-BLANK}" in +<<<<<<< HEAD +======= + BLANK) + # Legacy behavior for CircleCI + bash "${SCRIPTPATH}/build_cuda.sh" + ;; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda) bash "${SCRIPTPATH}/build_cuda.sh" ;; diff --git a/.ci/manywheel/build_common.sh b/.ci/manywheel/build_common.sh index b84268fd12896..2d8624f59b072 100644 --- a/.ci/manywheel/build_common.sh +++ b/.ci/manywheel/build_common.sh @@ -97,7 +97,11 @@ if [[ -z "$PYTORCH_ROOT" ]]; then exit 1 fi pushd "$PYTORCH_ROOT" +<<<<<<< HEAD retry pip install -qUr requirements-build.txt +======= +retry pip install -q cmake +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python setup.py clean retry pip install -qr requirements.txt case ${DESIRED_PYTHON} in @@ -138,11 +142,36 @@ fi echo "Calling setup.py bdist at $(date)" +<<<<<<< HEAD time CMAKE_ARGS=${CMAKE_ARGS[@]} \ EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \ BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \ USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \ python -m build --wheel --no-isolation --outdir /tmp/$WHEELHOUSE_DIR +======= +if [[ "$USE_SPLIT_BUILD" == "true" ]]; then + echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)" + time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \ + BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 \ + BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \ + USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \ + python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR + echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)" + echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" + time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \ + BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 \ + BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \ + USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \ + CMAKE_FRESH=1 python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR + echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" +else + time CMAKE_ARGS=${CMAKE_ARGS[@]} \ + EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \ + BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \ + USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \ + python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR +fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "Finished setup.py bdist at $(date)" # Build libtorch packages @@ -255,6 +284,13 @@ ls /tmp/$WHEELHOUSE_DIR mkdir -p "/$WHEELHOUSE_DIR" mv /tmp/$WHEELHOUSE_DIR/torch*linux*.whl /$WHEELHOUSE_DIR/ +<<<<<<< HEAD +======= +if [[ "$USE_SPLIT_BUILD" == "true" ]]; then + mv /tmp/$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/ || true +fi + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ -n "$BUILD_PYTHONLESS" ]]; then mkdir -p /$LIBTORCH_HOUSE_DIR mv /tmp/$LIBTORCH_HOUSE_DIR/*.zip /$LIBTORCH_HOUSE_DIR @@ -431,8 +467,21 @@ if [[ -z "$BUILD_PYTHONLESS" ]]; then pushd $PYTORCH_ROOT/test # Install the wheel for this Python version +<<<<<<< HEAD + pip uninstall -y "$TORCH_PACKAGE_NAME" + +======= + if [[ 
"$USE_SPLIT_BUILD" == "true" ]]; then + pip uninstall -y "$TORCH_NO_PYTHON_PACKAGE_NAME" || true + fi + pip uninstall -y "$TORCH_PACKAGE_NAME" + if [[ "$USE_SPLIT_BUILD" == "true" ]]; then + pip install "$TORCH_NO_PYTHON_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v + fi + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pip install "$TORCH_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v # Print info on the libraries installed in this wheel diff --git a/.ci/manywheel/build_cuda.sh b/.ci/manywheel/build_cuda.sh index 2a822295e0361..e7263ae1da951 100644 --- a/.ci/manywheel/build_cuda.sh +++ b/.ci/manywheel/build_cuda.sh @@ -66,9 +66,12 @@ case ${CUDA_VERSION} in TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX" fi ;; +<<<<<<< HEAD 13.0) TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX" ;; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 12.6) TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0" ;; @@ -113,6 +116,7 @@ DEPS_SONAME=( ) +<<<<<<< HEAD # CUDA_VERSION 12.*, 13.* if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then export USE_STATIC_CUDNN=0 @@ -125,6 +129,15 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then echo "Bundling with cudnn and cublas." +======= +# CUDA_VERSION 12.6, 12.8, 12.9 +if [[ $CUDA_VERSION == 12* ]]; then + export USE_STATIC_CUDNN=0 + # Try parallelizing nvcc as well + export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2" + if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then + echo "Bundling with cudnn and cublas." +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DEPS_LIST+=( "/usr/local/cuda/lib64/libcudnn_adv.so.9" "/usr/local/cuda/lib64/libcudnn_cnn.so.9" @@ -134,12 +147,23 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9" "/usr/local/cuda/lib64/libcudnn_heuristic.so.9" "/usr/local/cuda/lib64/libcudnn.so.9" +<<<<<<< HEAD "/usr/local/cuda/lib64/libcusparseLt.so.0" "/usr/local/cuda/lib64/libnvrtc-builtins.so" "/usr/local/cuda/lib64/libcufile.so.0" "/usr/local/cuda/lib64/libcufile_rdma.so.1" "/usr/local/cuda/lib64/libnvshmem_host.so.3" "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so" +======= + "/usr/local/cuda/lib64/libcublas.so.12" + "/usr/local/cuda/lib64/libcublasLt.so.12" + "/usr/local/cuda/lib64/libcusparseLt.so.0" + "/usr/local/cuda/lib64/libcudart.so.12" + "/usr/local/cuda/lib64/libnvrtc.so.12" + "/usr/local/cuda/lib64/libnvrtc-builtins.so" + "/usr/local/cuda/lib64/libcufile.so.0" + "/usr/local/cuda/lib64/libcufile_rdma.so.1" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) DEPS_SONAME+=( "libcudnn_adv.so.9" @@ -150,6 +174,7 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then "libcudnn_engines_precompiled.so.9" "libcudnn_heuristic.so.9" "libcudnn.so.9" +<<<<<<< HEAD "libcusparseLt.so.0" "libnvrtc-builtins.so" "libnvshmem_host.so.3" @@ -230,6 +255,35 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then ) fi +======= + "libcublas.so.12" + "libcublasLt.so.12" + "libcusparseLt.so.0" + "libcudart.so.12" + "libnvrtc.so.12" + "libnvrtc-builtins.so" + "libcufile.so.0" + "libcufile_rdma.so.1" + ) + else + echo "Using nvidia libs 
from pypi." + CUDA_RPATHS=( + '$ORIGIN/../../nvidia/cublas/lib' + '$ORIGIN/../../nvidia/cuda_cupti/lib' + '$ORIGIN/../../nvidia/cuda_nvrtc/lib' + '$ORIGIN/../../nvidia/cuda_runtime/lib' + '$ORIGIN/../../nvidia/cudnn/lib' + '$ORIGIN/../../nvidia/cufft/lib' + '$ORIGIN/../../nvidia/curand/lib' + '$ORIGIN/../../nvidia/cusolver/lib' + '$ORIGIN/../../nvidia/cusparse/lib' + '$ORIGIN/../../nvidia/cusparselt/lib' + '$ORIGIN/../../cusparselt/lib' + '$ORIGIN/../../nvidia/nccl/lib' + '$ORIGIN/../../nvidia/nvtx/lib' + '$ORIGIN/../../nvidia/cufile/lib' + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}") export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib' export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN' diff --git a/.ci/manywheel/build_libtorch.sh b/.ci/manywheel/build_libtorch.sh index d78fbd5c3ed36..dc9bf200bbcb8 100644 --- a/.ci/manywheel/build_libtorch.sh +++ b/.ci/manywheel/build_libtorch.sh @@ -92,7 +92,11 @@ if [[ -z "$PYTORCH_ROOT" ]]; then exit 1 fi pushd "$PYTORCH_ROOT" +<<<<<<< HEAD retry pip install -qUr requirements-build.txt +======= +retry pip install -q cmake +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python setup.py clean retry pip install -qr requirements.txt retry pip install -q numpy==2.0.1 @@ -104,7 +108,11 @@ if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr fi +<<<<<<< HEAD echo "Calling -m pip install . -v --no-build-isolation at $(date)" +======= +echo "Calling setup.py install at $(date)" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ $LIBTORCH_VARIANT = *"static"* ]]; then STATIC_CMAKE_FLAG="-DTORCH_STATIC=1" @@ -120,7 +128,11 @@ fi # TODO: Remove this flag once https://github.com/pytorch/pytorch/issues/55952 is closed CFLAGS='-Wno-deprecated-declarations' \ BUILD_LIBTORCH_CPU_WITH_DEBUG=1 \ +<<<<<<< HEAD python -m pip install --no-build-isolation -v . 
+======= + python setup.py install +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mkdir -p libtorch/{lib,bin,include,share} diff --git a/.ci/manywheel/build_rocm.sh b/.ci/manywheel/build_rocm.sh index bac56746f4501..1f97907e35040 100755 --- a/.ci/manywheel/build_rocm.sh +++ b/.ci/manywheel/build_rocm.sh @@ -107,10 +107,13 @@ if [[ $ROCM_INT -ge 60200 ]]; then ROCM_SO_FILES+=("librocm-core.so") fi +<<<<<<< HEAD if [[ $ROCM_INT -ge 70000 ]]; then ROCM_SO_FILES+=("librocroller.so") fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release` if [[ "$OS_NAME" == *"CentOS Linux"* || "$OS_NAME" == *"AlmaLinux"* ]]; then LIBGOMP_PATH="/usr/lib64/libgomp.so.1" @@ -198,7 +201,11 @@ ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library ROCBLAS_LIB_DST=lib/rocblas/library ROCBLAS_ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH) ROCBLAS_OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx) +<<<<<<< HEAD ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $ROCBLAS_OTHER_FILES) +======= +ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $OTHER_FILES) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # hipblaslt library files HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library diff --git a/.ci/manywheel/build_xpu.sh b/.ci/manywheel/build_xpu.sh index bd7b168be336c..034ef7cf08fc9 100755 --- a/.ci/manywheel/build_xpu.sh +++ b/.ci/manywheel/build_xpu.sh @@ -25,7 +25,10 @@ source /opt/intel/oneapi/mpi/latest/env/vars.sh export USE_STATIC_MKL=1 export USE_ONEMKL=1 export USE_XCCL=1 +<<<<<<< HEAD export USE_MPI=0 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) WHEELHOUSE_DIR="wheelhousexpu" LIBTORCH_HOUSE_DIR="libtorch_housexpu" diff --git a/.ci/pytorch/build-mobile.sh b/.ci/pytorch/build-mobile.sh new file mode 100755 index 0000000000000..1f253ff58c03d --- /dev/null +++ b/.ci/pytorch/build-mobile.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +# DO NOT ADD 'set -x' not to reveal CircleCI secret context environment variables +set -eu -o pipefail + +# This script uses linux host toolchain + mobile build options in order to +# build & test mobile libtorch without having to setup Android/iOS +# toolchain/simulator. + +# shellcheck source=./common.sh +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" +# shellcheck source=./common-build.sh +source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh" + +# Install torch & torchvision - used to download & trace test model. +# Ideally we should use the libtorch built on the PR so that backward +# incompatible changes won't break this script - but it will significantly slow +# down mobile CI jobs. +# Here we install nightly instead of stable so that we have an option to +# temporarily skip mobile CI jobs on BC-breaking PRs until they are in nightly. +retry pip install --pre torch torchvision \ + -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html \ + --progress-bar off + +# Run end-to-end process of building mobile library, linking into the predictor +# binary, and running forward pass with a real model. 
+if [[ "$BUILD_ENVIRONMENT" == *-mobile-custom-build-static* ]]; then + TEST_CUSTOM_BUILD_STATIC=1 test/mobile/custom_build/build.sh +elif [[ "$BUILD_ENVIRONMENT" == *-mobile-lightweight-dispatch* ]]; then + test/mobile/lightweight_dispatch/build.sh +else + TEST_DEFAULT_BUILD=1 test/mobile/custom_build/build.sh +fi + +print_sccache_stats diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh index cae81a2568d5c..d1db011006ebe 100755 --- a/.ci/pytorch/build.sh +++ b/.ci/pytorch/build.sh @@ -11,6 +11,13 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh" # shellcheck source=./common-build.sh source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh" +<<<<<<< HEAD +======= +if [[ "$BUILD_ENVIRONMENT" == *-mobile-*build* ]]; then + exec "$(dirname "${BASH_SOURCE[0]}")/build-mobile.sh" "$@" +fi + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "Python version:" python --version @@ -50,6 +57,12 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then export ATEN_THREADING=NATIVE fi +<<<<<<< HEAD +======= +# Enable LLVM dependency for TensorExpr testing +export USE_LLVM=/opt/llvm +export LLVM_DIR=/opt/llvm/lib/cmake/llvm +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if ! which conda; then # In ROCm CIs, we are doing cross compilation on build machines with @@ -89,6 +102,7 @@ fi if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then export USE_MKLDNN=1 export USE_MKLDNN_ACL=1 +<<<<<<< HEAD export ACL_ROOT_DIR=/acl fi @@ -111,6 +125,9 @@ if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then export SLEEF_TARGET_EXEC_USE_QEMU=ON sudo chown -R jenkins /var/lib/jenkins/workspace /opt +======= + export ACL_ROOT_DIR=/ComputeLibrary +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then @@ -138,8 +155,31 @@ if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then fi # Use special scripts for Android builds +<<<<<<< HEAD if [[ "$BUILD_ENVIRONMENT" == *vulkan* ]]; then +======= +if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then + export ANDROID_NDK=/opt/ndk + build_args=() + if [[ "${BUILD_ENVIRONMENT}" == *-arm-v7a* ]]; then + build_args+=("-DANDROID_ABI=armeabi-v7a") + elif [[ "${BUILD_ENVIRONMENT}" == *-arm-v8a* ]]; then + build_args+=("-DANDROID_ABI=arm64-v8a") + elif [[ "${BUILD_ENVIRONMENT}" == *-x86_32* ]]; then + build_args+=("-DANDROID_ABI=x86") + elif [[ "${BUILD_ENVIRONMENT}" == *-x86_64* ]]; then + build_args+=("-DANDROID_ABI=x86_64") + fi + if [[ "${BUILD_ENVIRONMENT}" == *vulkan* ]]; then + build_args+=("-DUSE_VULKAN=ON") + fi + build_args+=("-DUSE_LITE_INTERPRETER_PROFILER=OFF") + exec ./scripts/build_android.sh "${build_args[@]}" "$@" +fi + +if [[ "$BUILD_ENVIRONMENT" != *android* && "$BUILD_ENVIRONMENT" == *vulkan* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) export USE_VULKAN=1 # shellcheck disable=SC1091 source /var/lib/jenkins/vulkansdk/setup-env.sh @@ -173,7 +213,10 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then source /opt/intel/oneapi/mpi/latest/env/vars.sh # Enable XCCL build export USE_XCCL=1 +<<<<<<< HEAD export USE_MPI=0 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # XPU kineto feature dependencies are not fully ready, disable 
kineto build as temp WA export USE_KINETO=0 export TORCH_XPU_ARCH_LIST=pvc @@ -195,6 +238,7 @@ fi # We only build FlashAttention files for CUDA 8.0+, and they require large amounts of # memory to build and will OOM +<<<<<<< HEAD if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && echo "${TORCH_CUDA_ARCH_LIST}" | tr ' ' '\n' | sed 's/$/>= 8.0/' | bc | grep -q 1; then J=2 # default to 2 jobs @@ -205,6 +249,12 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && echo "${TORCH_CUDA_ARCH_LIST}" | tr ' esac echo "Building FlashAttention with job limit $J" export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j ${J}" +======= +if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]] && [ -z "$MAX_JOBS_OVERRIDE" ]; then + echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM" + echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage" + export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then @@ -219,6 +269,10 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then export USE_ASAN=1 export REL_WITH_DEB_INFO=1 export UBSAN_FLAGS="-fno-sanitize-recover=all" +<<<<<<< HEAD +======= + unset USE_LLVM +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [[ "${BUILD_ENVIRONMENT}" == *no-ops* ]]; then @@ -229,6 +283,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *-pch* ]]; then export USE_PRECOMPILED_HEADERS=1 fi +<<<<<<< HEAD if [[ "${BUILD_ENVIRONMENT}" != *cuda* ]]; then export BUILD_STATIC_RUNTIME_BENCHMARK=ON fi @@ -236,12 +291,23 @@ fi if [[ "$BUILD_ENVIRONMENT" == *-full-debug* ]]; then export CMAKE_BUILD_TYPE=Debug elif [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then +======= +if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]; then + export BUILD_STATIC_RUNTIME_BENCHMARK=ON +fi + +if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) export CMAKE_BUILD_TYPE=RelWithAssert fi # Do not change workspace permissions for ROCm and s390x CI jobs # as it can leave workspace with bad permissions for cancelled jobs +<<<<<<< HEAD if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && -d /var/lib/jenkins/workspace ]]; then +======= +if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96) WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace") cleanup_workspace() { @@ -286,18 +352,32 @@ else # XLA test build fails when WERROR=1 # set only when building other architectures # or building non-XLA tests. 
+<<<<<<< HEAD if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *xla* && "$BUILD_ENVIRONMENT" != *riscv64* ]]; then +======= + if [[ "$BUILD_ENVIRONMENT" != *rocm* && + "$BUILD_ENVIRONMENT" != *xla* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install numpy-2.0.2 for builds which are backward compatible with 1.X python -mpip install numpy==2.0.2 WERROR=1 python setup.py clean +<<<<<<< HEAD WERROR=1 python -m build --wheel --no-isolation +======= + if [[ "$USE_SPLIT_BUILD" == "true" ]]; then + python3 tools/packaging/split_wheel.py bdist_wheel + else + WERROR=1 python setup.py bdist_wheel + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else python setup.py clean if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then source .ci/pytorch/install_cache_xla.sh fi +<<<<<<< HEAD python -m build --wheel --no-isolation fi pip_install_whl "$(echo dist/*.whl)" @@ -322,6 +402,16 @@ else if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *torchao* ]]; then install_torchao fi +======= + if [[ "$USE_SPLIT_BUILD" == "true" ]]; then + echo "USE_SPLIT_BUILD cannot be used with xla or rocm" + exit 1 + else + python setup.py bdist_wheel + fi + fi + pip_install_whl "$(echo dist/*.whl)" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then echo "Checking that xpu is compiled" @@ -410,8 +500,15 @@ else # This is an attempt to mitigate flaky libtorch build OOM error. By default, the build parallelization # is set to be the number of CPU minus 2. So, let's try a more conservative value here. A 4xlarge has # 16 CPUs +<<<<<<< HEAD MAX_JOBS=$(nproc --ignore=4) export MAX_JOBS +======= + if [ -z "$MAX_JOBS_OVERRIDE" ]; then + MAX_JOBS=$(nproc --ignore=4) + export MAX_JOBS + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NB: Install outside of source directory (at the same level as the root # pytorch folder) so that it doesn't get cleaned away prior to docker push. 
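The MAX_JOBS hunk above caps build parallelism to mitigate libtorch OOMs: both sides compute the limit with nproc --ignore=4 (the CPU count minus four, so 12 jobs on the 16-CPU 4xlarge mentioned in the comment), and the incoming side additionally skips the cap when MAX_JOBS_OVERRIDE is already set. A minimal standalone bash sketch of that arithmetic, reusing only names that appear in the hunk, is:

#!/usr/bin/env bash
# Illustrative sketch of the job-limit logic in the hunk above, not a drop-in replacement.
set -euo pipefail

if [ -z "${MAX_JOBS_OVERRIDE:-}" ]; then
  # nproc --ignore=4 reports the online CPU count minus 4 (never below 1),
  # e.g. 12 jobs on the 16-CPU 4xlarge runner referenced in the comment.
  MAX_JOBS=$(nproc --ignore=4)
  export MAX_JOBS
fi
# When MAX_JOBS_OVERRIDE is set, the hunk leaves MAX_JOBS untouched and the
# caller's value is assumed to take effect through the existing build environment.

echo "MAX_JOBS=${MAX_JOBS:-unset (build default: CPU count minus 2)}"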
@@ -428,7 +525,12 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; # don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build python tools/stats/export_test_times.py fi +<<<<<<< HEAD # don't do this for bazel or s390x or riscv64 as they don't use sccache if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then +======= +# don't do this for bazel or s390x as they don't use sccache +if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) print_sccache_stats fi diff --git a/.ci/pytorch/check_binary.sh b/.ci/pytorch/check_binary.sh index 0f632f8006c07..83af7493bc821 100755 --- a/.ci/pytorch/check_binary.sh +++ b/.ci/pytorch/check_binary.sh @@ -300,3 +300,27 @@ except RuntimeError as e: exit 1 fi fi +<<<<<<< HEAD +======= + +############################################################################### +# Check for C++ ABI compatibility to GCC-11 - GCC 13 +############################################################################### +if [[ "$(uname)" == 'Linux' && "$PACKAGE_TYPE" == 'manywheel' ]]; then + pushd /tmp + # Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html + # gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19 + # gcc 11 - CUDA 11.8, xpu, rocm + # gcc 13 - CUDA 12.6, 12.8 and cpu + # Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426 + if [[ "$(uname -m)" == "s390x" ]]; then + cxx_abi="19" + elif [[ "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then + cxx_abi="18" + else + cxx_abi="16" + fi + python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)" + popd +fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/common-build.sh b/.ci/pytorch/common-build.sh index 8ca9fdb34c77a..23dca9287491e 100644 --- a/.ci/pytorch/common-build.sh +++ b/.ci/pytorch/common-build.sh @@ -13,6 +13,7 @@ if [[ "$BUILD_ENVIRONMENT" != *win-* ]]; then fi if which sccache > /dev/null; then +<<<<<<< HEAD # Clear SCCACHE_BUCKET and SCCACHE_REGION if they are empty, otherwise # sccache will complain about invalid bucket configuration if [[ -z "${SCCACHE_BUCKET:-}" ]]; then @@ -20,6 +21,8 @@ if [[ "$BUILD_ENVIRONMENT" != *win-* ]]; then unset SCCACHE_REGION fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Save sccache logs to file sccache --stop-server > /dev/null 2>&1 || true rm -f ~/sccache_error.log || true diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh index 2325c3d4ed4e7..a9dc698e939b0 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -67,17 +67,26 @@ function pip_install_whl() { # Loop through each path and install individually for path in "${paths[@]}"; do echo "Installing $path" +<<<<<<< HEAD python3 -mpip install "$path" +======= + python3 -mpip install --no-index --no-deps "$path" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) done else # Loop through each argument and install individually for path in "${args[@]}"; do echo "Installing $path" +<<<<<<< HEAD python3 -mpip install "$path" 
+======= + python3 -mpip install --no-index --no-deps "$path" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) done fi } +<<<<<<< HEAD function pip_build_and_install() { local build_target=$1 local wheel_dir=$2 @@ -105,6 +114,8 @@ function pip_build_and_install() { pip_install_whl "${file}" done } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) function pip_install() { # retry 3 times @@ -148,6 +159,7 @@ function get_pinned_commit() { cat .github/ci_commit_pins/"${1}".txt } +<<<<<<< HEAD function detect_cuda_arch() { if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then if command -v nvidia-smi; then @@ -165,6 +177,19 @@ function install_torchaudio() { local commit commit=$(get_pinned_commit audio) pip_build_and_install "git+https://github.com/pytorch/audio.git@${commit}" dist/audio +======= +function install_torchaudio() { + local commit + commit=$(get_pinned_commit audio) + if [[ "$1" == "cuda" ]]; then + # TODO: This is better to be passed as a parameter from _linux-test workflow + # so that it can be consistent with what is set in build + TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install --no-use-pep517 "git+https://github.com/pytorch/audio.git@${commit}" + else + pip_install --no-use-pep517 "git+https://github.com/pytorch/audio.git@${commit}" + fi + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } function install_torchtext() { @@ -172,8 +197,13 @@ function install_torchtext() { local text_commit data_commit=$(get_pinned_commit data) text_commit=$(get_pinned_commit text) +<<<<<<< HEAD pip_build_and_install "git+https://github.com/pytorch/data.git@${data_commit}" dist/data pip_build_and_install "git+https://github.com/pytorch/text.git@${text_commit}" dist/text +======= + pip_install --no-use-pep517 "git+https://github.com/pytorch/data.git@${data_commit}" + pip_install --no-use-pep517 "git+https://github.com/pytorch/text.git@${text_commit}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } function install_torchvision() { @@ -186,6 +216,7 @@ function install_torchvision() { echo 'char* dlerror(void) { return "";}'|gcc -fpic -shared -o "${HOME}/dlerror.so" -x c - LD_PRELOAD=${orig_preload}:${HOME}/dlerror.so fi +<<<<<<< HEAD if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then # Not sure if both are needed, but why not @@ -194,6 +225,9 @@ function install_torchvision() { fi pip_build_and_install "git+https://github.com/pytorch/vision.git@${commit}" dist/vision +======= + pip_install --no-use-pep517 "git+https://github.com/pytorch/vision.git@${commit}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [ -n "${LD_PRELOAD}" ]; then LD_PRELOAD=${orig_preload} fi @@ -213,6 +247,7 @@ function install_torchrec_and_fbgemm() { if [[ "$BUILD_ENVIRONMENT" == *rocm* ]] ; then # install torchrec first because it installs fbgemm nightly on top of rocm fbgemm +<<<<<<< HEAD pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec pip_uninstall fbgemm-gpu-nightly @@ -286,12 +321,37 @@ EOF else pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec pip_build_and_install 
"git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu +======= + pip_install --no-use-pep517 "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" + pip_uninstall fbgemm-gpu-nightly + + pip_install tabulate # needed for newer fbgemm + pip_install patchelf # needed for rocm fbgemm + git clone --recursive https://github.com/pytorch/fbgemm + pushd fbgemm/fbgemm_gpu + git checkout "${fbgemm_commit}" + python setup.py install \ + --package_variant=rocm \ + -DHIP_ROOT_DIR="${ROCM_PATH}" \ + -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \ + -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA" + popd + rm -rf fbgemm + else + # See https://github.com/pytorch/pytorch/issues/106971 + CUDA_PATH=/usr/local/cuda-12.1 pip_install --no-use-pep517 "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu" + pip_install --no-use-pep517 "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi } function clone_pytorch_xla() { if [[ ! -d ./xla ]]; then +<<<<<<< HEAD git clone --recursive --quiet https://github.com/pytorch/xla.git +======= + git clone --recursive -b r2.8 https://github.com/pytorch/xla.git +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pushd xla # pin the xla hash so that we don't get broken by changes to xla git checkout "$(cat ../.github/ci_commit_pins/xla.txt)" @@ -301,10 +361,41 @@ function clone_pytorch_xla() { fi } +<<<<<<< HEAD function install_torchao() { local commit commit=$(get_pinned_commit torchao) pip_build_and_install "git+https://github.com/pytorch/ao.git@${commit}" dist/ao +======= +function checkout_install_torchbench() { + local commit + commit=$(get_pinned_commit torchbench) + git clone https://github.com/pytorch/benchmark torchbench + pushd torchbench + git checkout "$commit" + + if [ "$1" ]; then + python install.py --continue_on_fail models "$@" + else + # Occasionally the installation may fail on one model but it is ok to continue + # to install and test other models + python install.py --continue_on_fail + fi + + # TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488 + # is regressing speedup metric. 
This needs to be investigated further + pip install transformers==4.38.1 + + echo "Print all dependencies after TorchBench is installed" + python -mpip freeze + popd +} + +function install_torchao() { + local commit + commit=$(get_pinned_commit torchao) + pip_install --no-use-pep517 "git+https://github.com/pytorch/ao.git@${commit}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } function print_sccache_stats() { diff --git a/.ci/pytorch/cpp_doc_push_script.sh b/.ci/pytorch/cpp_doc_push_script.sh index f085fa78bebe9..536966a992503 100755 --- a/.ci/pytorch/cpp_doc_push_script.sh +++ b/.ci/pytorch/cpp_doc_push_script.sh @@ -58,7 +58,11 @@ time python tools/setup_helpers/generate_code.py \ # Build the docs pushd docs/cpp +<<<<<<< HEAD time make VERBOSE=1 html +======= +time make VERBOSE=1 html -j +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd popd diff --git a/.ci/pytorch/create_test_cert.py b/.ci/pytorch/create_test_cert.py new file mode 100644 index 0000000000000..f2be0c13227d1 --- /dev/null +++ b/.ci/pytorch/create_test_cert.py @@ -0,0 +1,123 @@ +from datetime import datetime, timedelta, timezone +from tempfile import mkdtemp + +from cryptography import x509 +from cryptography.hazmat.primitives import hashes, serialization +from cryptography.hazmat.primitives.asymmetric import rsa +from cryptography.x509.oid import NameOID + + +temp_dir = mkdtemp() +print(temp_dir) + + +def genrsa(path): + key = rsa.generate_private_key( + public_exponent=65537, + key_size=2048, + ) + with open(path, "wb") as f: + f.write( + key.private_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PrivateFormat.TraditionalOpenSSL, + encryption_algorithm=serialization.NoEncryption(), + ) + ) + return key + + +def create_cert(path, C, ST, L, O, key): + subject = issuer = x509.Name( + [ + x509.NameAttribute(NameOID.COUNTRY_NAME, C), + x509.NameAttribute(NameOID.STATE_OR_PROVINCE_NAME, ST), + x509.NameAttribute(NameOID.LOCALITY_NAME, L), + x509.NameAttribute(NameOID.ORGANIZATION_NAME, O), + ] + ) + cert = ( + x509.CertificateBuilder() + .subject_name(subject) + .issuer_name(issuer) + .public_key(key.public_key()) + .serial_number(x509.random_serial_number()) + .not_valid_before(datetime.now(timezone.utc)) + .not_valid_after( + # Our certificate will be valid for 10 days + datetime.now(timezone.utc) + timedelta(days=10) + ) + .add_extension( + x509.BasicConstraints(ca=True, path_length=None), + critical=True, + ) + .sign(key, hashes.SHA256()) + ) + # Write our certificate out to disk. + with open(path, "wb") as f: + f.write(cert.public_bytes(serialization.Encoding.PEM)) + return cert + + +def create_req(path, C, ST, L, O, key): + csr = ( + x509.CertificateSigningRequestBuilder() + .subject_name( + x509.Name( + [ + # Provide various details about who we are. 
+ x509.NameAttribute(NameOID.COUNTRY_NAME, C), + x509.NameAttribute(NameOID.STATE_OR_PROVINCE_NAME, ST), + x509.NameAttribute(NameOID.LOCALITY_NAME, L), + x509.NameAttribute(NameOID.ORGANIZATION_NAME, O), + ] + ) + ) + .sign(key, hashes.SHA256()) + ) + with open(path, "wb") as f: + f.write(csr.public_bytes(serialization.Encoding.PEM)) + return csr + + +def sign_certificate_request(path, csr_cert, ca_cert, private_ca_key): + cert = ( + x509.CertificateBuilder() + .subject_name(csr_cert.subject) + .issuer_name(ca_cert.subject) + .public_key(csr_cert.public_key()) + .serial_number(x509.random_serial_number()) + .not_valid_before(datetime.now(timezone.utc)) + .not_valid_after( + # Our certificate will be valid for 10 days + datetime.now(timezone.utc) + timedelta(days=10) + # Sign our certificate with our private key + ) + .sign(private_ca_key, hashes.SHA256()) + ) + with open(path, "wb") as f: + f.write(cert.public_bytes(serialization.Encoding.PEM)) + return cert + + +ca_key = genrsa(temp_dir + "/ca.key") +ca_cert = create_cert( + temp_dir + "/ca.pem", + "US", + "New York", + "New York", + "Gloo Certificate Authority", + ca_key, +) + +pkey = genrsa(temp_dir + "/pkey.key") +csr = create_req( + temp_dir + "/csr.csr", + "US", + "California", + "San Francisco", + "Gloo Testing Company", + pkey, +) + +cert = sign_certificate_request(temp_dir + "/cert.pem", csr, ca_cert, ca_key) diff --git a/.ci/pytorch/functorch_doc_push_script.sh b/.ci/pytorch/functorch_doc_push_script.sh new file mode 100755 index 0000000000000..85c70dffa3966 --- /dev/null +++ b/.ci/pytorch/functorch_doc_push_script.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# This is where the local pytorch install in the docker image is located +pt_checkout="/var/lib/jenkins/workspace" +source "$pt_checkout/.ci/pytorch/common_utils.sh" +echo "functorch_doc_push_script.sh: Invoked with $*" + +set -ex -o pipefail + +version=${DOCS_VERSION:-nightly} +echo "version: $version" + +# Build functorch docs +pushd $pt_checkout/functorch/docs +make html +popd + +git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages +pushd functorch_ghpages + +if [ "$version" == "main" ]; then + version=nightly +fi + +git rm -rf "$version" || true +mv "$pt_checkout/functorch/docs/build/html" "$version" + +git add "$version" || true +git status +git config user.email "soumith+bot@pytorch.org" +git config user.name "pytorchbot" +# If there aren't changes, don't make a commit; push is no-op +git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true +git status + +if [[ "${WITH_PUSH:-}" == true ]]; then + git push -u origin gh-pages +fi + +popd diff --git a/.ci/pytorch/macos-build.sh b/.ci/pytorch/macos-build.sh index c01efda11ea6f..4c15dd5b1f9f9 100755 --- a/.ci/pytorch/macos-build.sh +++ b/.ci/pytorch/macos-build.sh @@ -36,11 +36,19 @@ fi print_cmake_info if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then # Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls +<<<<<<< HEAD USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python -m build --wheel --no-isolation else # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests # that building with USE_DISTRIBUTED=0 works at all. 
See https://github.com/pytorch/pytorch/issues/86448
 USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python -m build --wheel --no-isolation -C--build-option=--plat-name=macosx_11_0_arm64
+=======
+ USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
+else
+ # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
+ # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
+ USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 fi
 if which sccache > /dev/null; then
 print_sccache_stats
diff --git a/.ci/pytorch/macos-test.sh b/.ci/pytorch/macos-test.sh
index 2687852a2c4f3..7ec63ae2dd185 100755
--- a/.ci/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@@ -55,7 +55,11 @@ test_python_shard() {
 setup_test_python
+<<<<<<< HEAD
 time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "$1" "$NUM_TEST_SHARDS"
+=======
+ time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS"
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 assert_git_not_dirty
 }
@@ -157,6 +161,7 @@ test_jit_hooks() {
 assert_git_not_dirty
 }
+<<<<<<< HEAD
 # Shellcheck doesn't like it when you pass no arguments to a function
 # that can take args. See https://www.shellcheck.net/wiki/SC2120
 # shellcheck disable=SC2120
@@ -185,6 +190,8 @@ checkout_install_torchbench() {
 python -mpip freeze
 }
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 torchbench_setup_macos() {
 git clone --recursive https://github.com/pytorch/vision torchvision
 git clone --recursive https://github.com/pytorch/audio torchaudio
@@ -195,7 +202,11 @@ torchbench_setup_macos() {
 git checkout "$(cat ../.github/ci_commit_pins/vision.txt)"
 git submodule update --init --recursive
 python setup.py clean
+<<<<<<< HEAD
 python -m pip install -e . -v --no-build-isolation
+=======
+ python setup.py develop
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 popd
 pushd torchaudio
@@ -204,14 +215,26 @@ torchbench_setup_macos() {
 git submodule update --init --recursive
 python setup.py clean
 #TODO: Remove me, when figure out how to make TorchAudio find brew installed openmp
+<<<<<<< HEAD
 USE_OPENMP=0 python -m pip install -e . -v --no-build-isolation
 popd
+=======
+ USE_OPENMP=0 python setup.py develop
+ popd
+
+ # Shellcheck doesn't like it when you pass no arguments to a function that can take args. See https://www.shellcheck.net/wiki/SC2120
+ # shellcheck disable=SC2119,SC2120
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 checkout_install_torchbench
 }
 pip_benchmark_deps() {
+<<<<<<< HEAD
 python -mpip install --no-input requests cython scikit-learn six
+=======
+ python -mpip install --no-input astunparse requests cython scikit-learn
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 }
@@ -256,7 +279,11 @@ test_torchbench_smoketest() {
 local device=mps
 local dtypes=(undefined float16 bfloat16 notset)
 local dtype=${dtypes[$1]}
+<<<<<<< HEAD
 local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)
+=======
+ local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 for backend in eager inductor; do
@@ -302,6 +329,7 @@ test_torchbench_smoketest() {
 fi
 done
+<<<<<<< HEAD
 echo "Pytorch benchmark on mps device completed"
 }
@@ -343,6 +371,8 @@ test_aoti_torchbench_smoketest() {
 PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \
 --accuracy --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
 --output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_accuracy.csv" || true
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 echo "Pytorch benchmark on mps device completed"
 }
@@ -391,8 +421,11 @@ elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then
 test_timm_perf
 elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then
 test_torchbench_smoketest "${SHARD_NUMBER}"
+<<<<<<< HEAD
 elif [[ $TEST_CONFIG == *"aot_inductor_perf_smoketest"* ]]; then
 test_aoti_torchbench_smoketest "${SHARD_NUMBER}"
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 elif [[ $TEST_CONFIG == *"mps"* ]]; then
 test_python_mps
 elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then
diff --git a/.ci/pytorch/multigpu-test.sh b/.ci/pytorch/multigpu-test.sh
index 039459816724f..4ab4c038eadf9 100755
--- a/.ci/pytorch/multigpu-test.sh
+++ b/.ci/pytorch/multigpu-test.sh
@@ -26,7 +26,10 @@ if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then
 time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
 time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
 time python test/run_test.py --verbose -i distributed/test_compute_comm_reordering
+<<<<<<< HEAD
 time python test/run_test.py --verbose -i distributed/test_aten_comm_compute_reordering
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 time python test/run_test.py --verbose -i distributed/test_store
 time python test/run_test.py --verbose -i distributed/test_symmetric_memory
 time python test/run_test.py --verbose -i distributed/test_pg_wrapper
@@ -46,7 +49,10 @@ if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then
 # DTensor tests
 time python test/run_test.py --verbose -i distributed/tensor/test_random_ops
 time python test/run_test.py --verbose -i distributed/tensor/test_dtensor_compile
+<<<<<<< HEAD
 time python test/run_test.py --verbose -i distributed/tensor/test_utils.py
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 # DeviceMesh test
 time python test/run_test.py --verbose -i distributed/test_device_mesh
diff --git a/.ci/pytorch/run_glootls_test.sh b/.ci/pytorch/run_glootls_test.sh
new file mode 100755
index 0000000000000..cd17b269fe6a9
--- /dev/null
+++ b/.ci/pytorch/run_glootls_test.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+CREATE_TEST_CERT="$(dirname "${BASH_SOURCE[0]}")/create_test_cert.py"
+TMP_CERT_DIR=$(python "$CREATE_TEST_CERT")
+
+openssl verify -CAfile "${TMP_CERT_DIR}/ca.pem" "${TMP_CERT_DIR}/cert.pem"
+
+export GLOO_DEVICE_TRANSPORT=TCP_TLS
+export GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY=${TMP_CERT_DIR}/pkey.key
+export GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT=${TMP_CERT_DIR}/cert.pem
+export GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE=${TMP_CERT_DIR}/ca.pem
+
+time python test/run_test.py --include distributed/test_c10d_gloo --verbose -- ProcessGroupGlooTest
+
+unset GLOO_DEVICE_TRANSPORT
+unset GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY
+unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT
+unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE
diff --git a/.ci/pytorch/run_tests.sh b/.ci/pytorch/run_tests.sh
index f5ed90deef249..97ae8d22c7917 100755
--- a/.ci/pytorch/run_tests.sh
+++ b/.ci/pytorch/run_tests.sh
@@ -74,6 +74,7 @@ else
 fi
 # Environment initialization
+<<<<<<< HEAD
 retry pip install -qUr requirements-build.txt
 if [[ "$(uname)" == Darwin ]]; then
 # Install the testing dependencies
@@ -81,6 +82,14 @@ if [[ "$(uname)" == Darwin ]]; then
 else
 retry pip install -qr requirements.txt || true
 retry pip install -q hypothesis protobuf pytest || true
+=======
+if [[ "$(uname)" == Darwin ]]; then
+ # Install the testing dependencies
+ retry pip install -q future hypothesis ${NUMPY_PACKAGE} ${PROTOBUF_PACKAGE} pytest setuptools six typing_extensions pyyaml
+else
+ retry pip install -qr requirements.txt || true
+ retry pip install -q hypothesis protobuf pytest setuptools || true
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 numpy_ver=1.15
 case "$(python --version 2>&1)" in
 *2* | *3.5* | *3.6*)
diff --git a/.ci/pytorch/smoke_test/check_binary_symbols.py b/.ci/pytorch/smoke_test/check_binary_symbols.py
index b0c607659c72d..1dd56236e2619 100755
--- a/.ci/pytorch/smoke_test/check_binary_symbols.py
+++ b/.ci/pytorch/smoke_test/check_binary_symbols.py
@@ -32,9 +32,12 @@
 "torch::",
 )
+<<<<<<< HEAD
 # Patterns for detecting statically linked libstdc++ symbols
 STATICALLY_LINKED_CXX11_ABI = [re.compile(r".*recursive_directory_iterator.*")]
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 def _apply_libtorch_symbols(symbols):
 return [
@@ -56,17 +59,24 @@ def get_symbols(lib: str) -> list[tuple[str, str, str]]:
 return [x.split(" ", 2) for x in lines.decode("latin1").split("\n")[:-1]]
+<<<<<<< HEAD
 def grep_symbols(
 lib: str, patterns: list[Any], symbol_type: str | None = None
 ) -> list[str]:
+=======
+def grep_symbols(lib: str, patterns: list[Any]) -> list[str]:
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 def _grep_symbols(
 symbols: list[tuple[str, str, str]], patterns: list[Any]
 ) -> list[str]:
 rc = []
 for
_s_addr, _s_type, s_name in symbols: +<<<<<<< HEAD # Filter by symbol type if specified if symbol_type and _s_type != symbol_type: continue +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for pattern in patterns: if pattern.match(s_name): rc.append(s_name) @@ -88,6 +98,7 @@ def _get_symbols_chunk(i): return functools.reduce(list.__add__, (x.result() for x in tasks), []) +<<<<<<< HEAD def check_lib_statically_linked_libstdc_cxx_abi_symbols(lib: str) -> None: cxx11_statically_linked_symbols = grep_symbols( lib, STATICALLY_LINKED_CXX11_ABI, symbol_type="T" @@ -100,6 +111,8 @@ def check_lib_statically_linked_libstdc_cxx_abi_symbols(lib: str) -> None: ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def check_lib_symbols_for_abi_correctness(lib: str) -> None: print(f"lib: {lib}") cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS) @@ -127,7 +140,10 @@ def main() -> None: libtorch_cpu_path = str(install_root / "lib" / "libtorch_cpu.so") check_lib_symbols_for_abi_correctness(libtorch_cpu_path) +<<<<<<< HEAD check_lib_statically_linked_libstdc_cxx_abi_symbols(libtorch_cpu_path) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if __name__ == "__main__": diff --git a/.ci/pytorch/smoke_test/smoke_test.py b/.ci/pytorch/smoke_test/smoke_test.py index 675d58a3e283d..7840686f13171 100644 --- a/.ci/pytorch/smoke_test/smoke_test.py +++ b/.ci/pytorch/smoke_test/smoke_test.py @@ -385,6 +385,7 @@ def foo(x: torch.Tensor) -> torch.Tensor: x_pt2 = torch.compile(model, mode="max-autotune")(x) +<<<<<<< HEAD def smoke_test_nvshmem() -> None: if not torch.cuda.is_available() or target_os == "windows": print("Windows platform or CUDA is not available, skipping NVSHMEM test") @@ -410,6 +411,8 @@ def smoke_test_nvshmem() -> None: print(f"NVSHMEM available at run time: {_is_nvshmem_available()}") +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def smoke_test_modules(): cwd = os.getcwd() for module in MODULES: @@ -504,8 +507,11 @@ def main() -> None: options.pypi_pkg_check, ) +<<<<<<< HEAD smoke_test_nvshmem() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if __name__ == "__main__": main() diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 40dc90f2eb24f..90546839a2bd8 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -11,8 +11,11 @@ export TERM=vt100 # shellcheck source=./common.sh source "$(dirname "${BASH_SOURCE[0]}")/common.sh" +<<<<<<< HEAD # shellcheck source=./common-build.sh source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Do not change workspace permissions for ROCm and s390x CI jobs # as it can leave workspace with bad permissions for cancelled jobs @@ -32,6 +35,7 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v git config --global --add safe.directory /var/lib/jenkins/workspace fi +<<<<<<< HEAD # Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878 if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then @@ -44,6 +48,8 @@ if [[ "$BUILD_ENVIRONMENT" 
== *cuda* ]]; then fi fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "Environment variables:" env @@ -103,7 +109,10 @@ if [[ "$BUILD_ENVIRONMENT" == *clang9* || "$BUILD_ENVIRONMENT" == *xpu* ]]; then export VALGRIND=OFF fi +<<<<<<< HEAD detect_cuda_arch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$BUILD_ENVIRONMENT" == *s390x* ]]; then # There are additional warnings on s390x, maybe due to newer gcc. @@ -178,6 +187,11 @@ elif [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu" # setting PYTHON_TEST_EXTRA_OPTION export PYTHON_TEST_EXTRA_OPTION="--xpu" +<<<<<<< HEAD +======= + # Disable sccache for xpu test due to flaky issue https://github.com/pytorch/pytorch/issues/143585 + sudo rm -rf /opt/cache +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [[ "$TEST_CONFIG" == *crossref* ]]; then @@ -302,12 +316,15 @@ elif [[ $TEST_CONFIG == 'nogpu_AVX512' ]]; then export ATEN_CPU_CAPABILITY=avx2 fi +<<<<<<< HEAD if [[ "${TEST_CONFIG}" == "legacy_nvidia_driver" ]]; then # Make sure that CUDA can be initialized (cd test && python -c "import torch; torch.rand(2, 2, device='cuda')") export USE_LEGACY_DRIVER=1 fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_python_legacy_jit() { time python test/run_test.py --include test_jit_legacy test_jit_fuser_legacy --verbose assert_git_not_dirty @@ -324,18 +341,27 @@ test_python_shard() { # modify LD_LIBRARY_PATH to ensure it has the conda env. 
# This set of tests has been shown to be buggy without it for the split-build +<<<<<<< HEAD time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running +======= + time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) assert_git_not_dirty } test_python() { # shellcheck disable=SC2086 +<<<<<<< HEAD time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION +======= + time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) assert_git_not_dirty } test_python_smoke() { +<<<<<<< HEAD # Smoke tests for H100/B200 time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running assert_git_not_dirty @@ -344,6 +370,10 @@ test_python_smoke() { test_python_smoke_b200() { # Targeted smoke tests for B200 - staged approach to avoid too many failures time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running +======= + # Smoke tests for H100 + time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) assert_git_not_dirty } @@ -352,6 +382,7 @@ test_h100_distributed() { time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running # This test requires multicast support time python test/run_test.py --include distributed/_composable/fsdp/test_fully_shard_comm.py -k TestFullyShardAllocFromPG $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running +<<<<<<< HEAD assert_git_not_dirty } @@ -370,6 +401,14 @@ test_h100_cutlass_backend() { TORCHINDUCTOR_CUTLASS_DIR=$(realpath "./third_party/cutlass") python test/run_test.py --include inductor/test_cutlass_evt $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running } +======= + # symmetric memory test + time python test/run_test.py --include distributed/test_symmetric_memory.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + time python test/run_test.py --include distributed/test_nvshmem.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + assert_git_not_dirty +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_lazy_tensor_meta_reference_disabled() { export TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE=1 echo "Testing lazy tensor operations without meta reference" @@ -392,7 +431,10 @@ test_dynamo_wrapped_shard() { --exclude-distributed-tests \ --exclude-torch-export-tests \ 
--exclude-aot-dispatch-tests \ +<<<<<<< HEAD --exclude-quantization-tests \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --shard "$1" "$NUM_TEST_SHARDS" \ --verbose \ --upload-artifacts-while-running @@ -413,10 +455,16 @@ test_einops() { test_inductor_distributed() { # Smuggle a few multi-gpu tests here so that we don't have to request another large node echo "Testing multi_gpu tests in test_torchinductor" +<<<<<<< HEAD python test/run_test.py -i inductor/test_aot_inductor.py -k test_replicate_on_devices --verbose python test/run_test.py -i inductor/test_aot_inductor.py -k test_on_gpu_device1 --verbose python test/run_test.py -i inductor/test_aot_inductor.py -k test_non_default_gpu_device --verbose python test/run_test.py -i inductor/test_aot_inductor.py -k test_load_package_multiple_gpus --verbose +======= + python test/run_test.py -i inductor/test_torchinductor.py -k test_multi_gpu --verbose + python test/run_test.py -i inductor/test_aot_inductor.py -k test_non_default_cuda_device --verbose + python test/run_test.py -i inductor/test_aot_inductor.py -k test_replicate_on_devices --verbose +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose python test/run_test.py -i distributed/tensor/test_dtensor_compile.py --verbose python test/run_test.py -i distributed/tensor/parallel/test_micro_pipeline_tp.py --verbose @@ -437,7 +485,11 @@ test_inductor_distributed() { # this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported # with if required # gpus aren't available +<<<<<<< HEAD python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_aten_comm_compute_reordering distributed/test_compute_comm_reordering --verbose +======= + python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_compute_comm_reordering --verbose +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) assert_git_not_dirty } @@ -460,12 +512,19 @@ test_inductor_shard() { --verbose } +<<<<<<< HEAD test_inductor_aoti_cpp() { +======= +test_inductor_aoti() { + # docker build uses bdist_wheel which does not work with test_aot_inductor + # TODO: need a faster way to build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then # We need to hipify before building again python3 tools/amd_build/build_amd.py fi if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then +<<<<<<< HEAD # TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}") else @@ -489,6 +548,16 @@ test_inductor_aoti_cross_compile_for_windows() { ls -lah "$(pwd)/win-torch-wheel-extracted/lib/x64/" || true python test/inductor/test_aoti_cross_compile_windows.py -k compile --package-dir "$TEST_REPORTS_DIR" --win-torch-lib-dir "$(pwd)/win-torch-wheel-extracted/torch/lib" +======= + BUILD_AOT_INDUCTOR_TEST=1 TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python setup.py develop + # TODO: Replace me 
completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB + LD_LIBRARY_PATH=/opt/conda/envs/py_3.10/lib/:${TORCH_LIB_DIR}:$LD_LIBRARY_PATH + CPP_TESTS_DIR="${BUILD_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile + else + BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop + CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } test_inductor_cpp_wrapper_shard() { @@ -501,6 +570,7 @@ test_inductor_cpp_wrapper_shard() { TEST_REPORTS_DIR=$(pwd)/test/test-reports mkdir -p "$TEST_REPORTS_DIR" +<<<<<<< HEAD # Run certain inductor unit tests with cpp wrapper. In the end state, we # should be able to run all the inductor unit tests with cpp_wrapper. # @@ -528,6 +598,48 @@ test_inductor_cpp_wrapper_shard() { -k 'xpu' \ --shard "$1" "$NUM_TEST_SHARDS" \ --verbose +======= + if [[ "$1" -eq "2" ]]; then + # For now, manually put the opinfo tests in shard 2, and all other tests in + # shard 1. Run all CPU tests, as well as specific GPU tests triggering past + # bugs, for now. + python test/run_test.py \ + --include inductor/test_torchinductor_opinfo \ + -k 'linalg or to_sparse or TestInductorOpInfoCPU' \ + --verbose + exit + fi + + # Run certain inductor unit tests with cpp wrapper. In the end state, we + # should be able to run all the inductor unit tests with cpp_wrapper. + python test/run_test.py \ + --include inductor/test_torchinductor inductor/test_max_autotune inductor/test_cpu_repro \ + --verbose + python test/run_test.py --inductor --include test_torch -k 'take' --verbose + + # Run inductor benchmark tests with cpp wrapper. + # Skip benchmark tests if it's in rerun-disabled-mode. 
+ if [[ "${PYTORCH_TEST_RERUN_DISABLED_TESTS}" == "1" ]]; then + echo "skip dynamo benchmark tests for rerun-disabled-test" + else + echo "run dynamo benchmark tests with cpp wrapper" + python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \ + --training --inductor --disable-cudagraphs --only vit_base_patch16_224 \ + --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" + python benchmarks/dynamo/check_accuracy.py \ + --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \ + --expected "benchmarks/dynamo/ci_expected_accuracy/${MAYBE_ROCM}inductor_timm_training.csv" + + python benchmarks/dynamo/torchbench.py --device cuda --accuracy \ + --bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" + python benchmarks/dynamo/torchbench.py --device cuda --accuracy \ + --bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" + python benchmarks/dynamo/torchbench.py --device cuda --accuracy \ + --bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" + python benchmarks/dynamo/check_accuracy.py \ + --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \ + --expected "benchmarks/dynamo/ci_expected_accuracy/${MAYBE_ROCM}inductor_torchbench_inference.csv" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi } @@ -649,8 +761,13 @@ test_perf_for_dashboard() { local device=cuda if [[ "${TEST_CONFIG}" == *cpu* ]]; then +<<<<<<< HEAD if [[ "${TEST_CONFIG}" == *cpu_x86_zen* ]]; then device=cpu_x86_zen +======= + if [[ "${TEST_CONFIG}" == *zen_cpu_x86* ]]; then + device=zen_cpu_x86 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${TEST_CONFIG}" == *cpu_x86* ]]; then device=cpu_x86 elif [[ "${TEST_CONFIG}" == *cpu_aarch64* ]]; then @@ -661,19 +778,26 @@ test_perf_for_dashboard() { device=cuda_a10g elif [[ "${TEST_CONFIG}" == *h100* ]]; then device=cuda_h100 +<<<<<<< HEAD elif [[ "${TEST_CONFIG}" == *b200* ]]; then device=cuda_b200 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${TEST_CONFIG}" == *rocm* ]]; then device=rocm fi for mode in "${modes[@]}"; do if [[ "$mode" == "inference" ]]; then +<<<<<<< HEAD if [[ "$device" == "cpu_x86" ]]; then dtype=amp else dtype=bfloat16 fi +======= + dtype=bfloat16 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "$mode" == "training" ]]; then dtype=amp fi @@ -685,10 +809,13 @@ test_perf_for_dashboard() { target_flag+=( --no-translation-validation) fi +<<<<<<< HEAD if [[ "$DASHBOARD_TAG" == *freezing-true* ]]; then target_flag+=( --freezing) fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$DASHBOARD_TAG" == *default-true* ]]; then $TASKSET python "benchmarks/dynamo/$suite.py" \ "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \ @@ -837,6 +964,7 @@ test_dynamo_benchmark() { if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@" elif [[ "${TEST_CONFIG}" == *perf* ]]; then +<<<<<<< HEAD # TODO (huydhn): Just smoke test 
some sample models if [[ "${TEST_CONFIG}" == *b200* ]]; then if [[ "${suite}" == "huggingface" ]]; then @@ -847,6 +975,8 @@ test_dynamo_benchmark() { export TORCHBENCH_ONLY_MODELS="BERT_pytorch" fi fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@" else if [[ "${TEST_CONFIG}" == *cpu* ]]; then @@ -875,13 +1005,21 @@ test_inductor_torchbench_smoketest_perf() { mkdir -p "$TEST_REPORTS_DIR" python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \ +<<<<<<< HEAD --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only BERT_pytorch \ +======= + --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" # The threshold value needs to be actively maintained to make this check useful python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4 # Check memory compression ratio for a few models +<<<<<<< HEAD for test in BERT_pytorch yolov3; do +======= + for test in hf_Albert timm_vision_transformer; do +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \ --disable-cudagraphs --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \ --only $test --output "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" @@ -892,7 +1030,11 @@ test_inductor_torchbench_smoketest_perf() { done # Perform some "warm-start" runs for a few huggingface models. 
+<<<<<<< HEAD for test in AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do +======= + for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \ --only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" python benchmarks/dynamo/check_accuracy.py \ @@ -906,7 +1048,11 @@ test_inductor_set_cpu_affinity(){ export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD" export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" +<<<<<<< HEAD if [[ "$(uname -m)" != "aarch64" ]]; then +======= + if [[ "${TEST_CONFIG}" != *aarch64* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Use Intel OpenMP for x86 IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so" export LD_PRELOAD="$IOMP_LIB":"$LD_PRELOAD" @@ -920,7 +1066,11 @@ test_inductor_set_cpu_affinity(){ cores=$((cpus / thread_per_core)) # Set number of cores to 16 on aarch64 for performance runs +<<<<<<< HEAD if [[ "$(uname -m)" == "aarch64" && $cores -gt 16 ]]; then +======= + if [[ "${TEST_CONFIG}" == *aarch64* && $cores -gt 16 ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cores=16 fi export OMP_NUM_THREADS=$cores @@ -974,6 +1124,15 @@ test_torchbench_gcp_smoketest(){ popd } +<<<<<<< HEAD +======= +test_python_gloo_with_tls() { + source "$(dirname "${BASH_SOURCE[0]}")/run_glootls_test.sh" + assert_git_not_dirty +} + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_aten() { # Test ATen # The following test(s) of ATen have already been skipped by caffe2 in rocm environment: @@ -1020,8 +1179,11 @@ test_without_numpy() { if [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;torch.compile(lambda x:print(x))('Hello World')" fi +<<<<<<< HEAD # Regression test for https://github.com/pytorch/pytorch/pull/157734 (torch.onnx should be importable without numpy) python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch; import torch.onnx" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd } @@ -1085,10 +1247,26 @@ test_libtorch_api() { mkdir -p $TEST_REPORTS_DIR OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml +<<<<<<< HEAD +======= + "$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else # Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest" +<<<<<<< HEAD +======= + # On s390x, pytorch is built without llvm. 
+ # Even if it would be built with llvm, llvm currently doesn't support used features on s390x and + # test fails with errors like: + # JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer + # unknown file: Failure + # C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) } + if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then + python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi # quantization is not fully supported on s390x yet @@ -1171,12 +1349,15 @@ test_distributed() { fi } +<<<<<<< HEAD test_quantization() { echo "Testing quantization" python test/test_quantization.py } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_rpc() { echo "Testing RPC C++ tests" # NB: the ending test_rpc must match the current function name for the current @@ -1362,6 +1543,7 @@ EOF # Step 2. Make sure that the public API test "test_correct_module_names" fails when an existing # file is modified to introduce an invalid public API function. +<<<<<<< HEAD # The filepath here must not have __all__ defined in it, otherwise the test will pass. # If your PR introduces __all__ to torch/cuda/streams.py please point this to another file # that does not have __all__ defined. @@ -1369,6 +1551,12 @@ EOF cp -v "${EXISTING_FILEPATH}" "${EXISTING_FILEPATH}.orig" echo "${BAD_PUBLIC_FUNC}" >> "${EXISTING_FILEPATH}" invalid_api="torch.cuda.streams.new_public_func" +======= + EXISTING_FILEPATH="${TORCH_INSTALL_DIR}/nn/parameter.py" + cp -v "${EXISTING_FILEPATH}" "${EXISTING_FILEPATH}.orig" + echo "${BAD_PUBLIC_FUNC}" >> "${EXISTING_FILEPATH}" + invalid_api="torch.nn.parameter.new_public_func" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "Appended an invalid public API function to existing file ${EXISTING_FILEPATH}..." 
check_public_api_test_fails \ @@ -1423,7 +1611,11 @@ EOF pip3 install -r requirements.txt # shellcheck source=./common-build.sh source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh" +<<<<<<< HEAD python -m build --wheel --no-isolation -C--build-option=--bdist-dir="base_bdist_tmp" --outdir "base_dist" +======= + python setup.py bdist_wheel --bdist-dir="base_bdist_tmp" --dist-dir="base_dist" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python -mpip install base_dist/*.whl echo "::endgroup::" @@ -1571,10 +1763,21 @@ test_executorch() { install_torchvision install_torchaudio +<<<<<<< HEAD INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh" pushd /executorch "${INSTALL_SCRIPT}" setup_executorch +======= + pushd /executorch + + export PYTHON_EXECUTABLE=python + export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" + + # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch + # from the PR + bash .ci/scripts/setup-linux.sh --build-tool cmake +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "Run ExecuTorch unit tests" pytest -v -n auto @@ -1588,14 +1791,25 @@ test_executorch() { popd +<<<<<<< HEAD +======= + # Test torchgen generated code for Executorch. + echo "Testing ExecuTorch op registration" + "$BUILD_BIN_DIR"/test_edge_op_registration + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) assert_git_not_dirty } test_linux_aarch64() { python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \ test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \ +<<<<<<< HEAD test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops profiler/test_memory_profiler \ distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \ +======= + test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops test_cpp_extensions_open_device_registration \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose # Dynamo tests @@ -1621,12 +1835,19 @@ test_operator_benchmark() { TEST_REPORTS_DIR=$(pwd)/test/test-reports mkdir -p "$TEST_REPORTS_DIR" TEST_DIR=$(pwd) +<<<<<<< HEAD ARCH=$(uname -m) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_inductor_set_cpu_affinity cd benchmarks/operator_benchmark/pt_extension +<<<<<<< HEAD python -m pip install . 
-v --no-build-isolation +======= + python setup.py install +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cd "${TEST_DIR}"/benchmarks/operator_benchmark $TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \ @@ -1636,6 +1857,7 @@ test_operator_benchmark() { pip_install pandas python check_perf_csv.py \ --actual "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \ +<<<<<<< HEAD --expected "${ARCH}_expected_ci_operator_benchmark_eager_float32_cpu.csv" } @@ -1658,6 +1880,11 @@ test_operator_microbenchmark() { --benchmark-name "PyTorch operator microbenchmark" done } +======= + --expected "expected_ci_operator_benchmark_eager_float32_cpu.csv" +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then (cd test && python -c "import torch; print(torch.__config__.show())") @@ -1665,6 +1892,7 @@ if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-baze fi if [[ "${TEST_CONFIG}" == *numpy_2* ]]; then # Install numpy-2.0.2 and compatible scipy & numba versions +<<<<<<< HEAD # Force re-install of pandas to avoid error where pandas checks numpy version from initial install and fails upon import TMP_PANDAS_VERSION=$(python -c "import pandas; print(pandas.__version__)" 2>/dev/null) if [ -n "$TMP_PANDAS_VERSION" ]; then @@ -1674,6 +1902,11 @@ if [[ "${TEST_CONFIG}" == *numpy_2* ]]; then fi python test/run_test.py --include dynamo/test_functions.py dynamo/test_unspec.py test_binary_ufuncs.py test_fake_tensor.py test_linalg.py test_numpy_interop.py test_tensor_creation_ops.py test_torch.py torch_np/test_basic.py elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]]; then +======= + python -mpip install --pre numpy==2.0.2 scipy==1.13.1 numba==0.60.0 + python test/run_test.py --include dynamo/test_functions.py dynamo/test_unspec.py test_binary_ufuncs.py test_fake_tensor.py test_linalg.py test_numpy_interop.py test_tensor_creation_ops.py test_torch.py torch_np/test_basic.py +elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" != *perf_cpu_aarch64* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_linux_aarch64 elif [[ "${TEST_CONFIG}" == *backward* ]]; then test_forward_backward_compatibility @@ -1682,16 +1915,22 @@ elif [[ "${TEST_CONFIG}" == *xla* ]]; then install_torchvision build_xla test_xla +<<<<<<< HEAD elif [[ "$TEST_CONFIG" == *vllm* ]]; then echo "vLLM CI uses TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST" (cd .ci/lumen_cli && python -m pip install -e .) 
python -m cli.run test external vllm --test-plan "$TEST_CONFIG" --shard-id "$SHARD_NUMBER" --num-shards "$NUM_TEST_SHARDS" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${TEST_CONFIG}" == *executorch* ]]; then test_executorch elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then test_python_legacy_jit +<<<<<<< HEAD elif [[ "$TEST_CONFIG" == 'quantization' ]]; then test_quantization +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then # TODO: run some C++ tests echo "no-op at the moment" @@ -1714,8 +1953,11 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then test_operator_benchmark cpu ${TEST_MODE} fi +<<<<<<< HEAD elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then test_operator_microbenchmark +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then test_inductor_distributed elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then @@ -1724,8 +1966,11 @@ elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then test_inductor_triton_cpu elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then test_inductor_micro_benchmark +<<<<<<< HEAD elif [[ "${TEST_CONFIG}" == *aoti_cross_compile_for_windows* ]]; then test_inductor_aoti_cross_compile_for_windows +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then install_torchvision id=$((SHARD_NUMBER-1)) @@ -1735,6 +1980,7 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then id=$((SHARD_NUMBER-1)) test_dynamo_benchmark timm_models "$id" elif [[ "${TEST_CONFIG}" == cachebench ]]; then +<<<<<<< HEAD install_torchaudio install_torchvision PYTHONPATH=/torchbench test_cachebench @@ -1745,21 +1991,56 @@ elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then install_torchaudio install_torchvision +======= + install_torchaudio cuda + install_torchvision + checkout_install_torchbench nanogpt BERT_pytorch resnet50 hf_T5 llama moco + PYTHONPATH=$(pwd)/torchbench test_cachebench +elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then + install_torchaudio cpu + install_torchvision + checkout_install_torchbench nanogpt + PYTHONPATH=$(pwd)/torchbench test_verify_cachebench +elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then + if [[ "${TEST_CONFIG}" == *cpu* ]]; then + install_torchaudio cpu + else + install_torchaudio cuda + fi + install_torchvision + TORCH_CUDA_ARCH_LIST="8.0;8.6" install_torchao +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id=$((SHARD_NUMBER-1)) # https://github.com/opencv/opencv-python/issues/885 pip_install opencv-python==4.8.0.74 if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then +<<<<<<< HEAD PYTHONPATH=/torchbench test_inductor_torchbench_smoketest_perf elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then PYTHONPATH=/torchbench test_inductor_torchbench_cpu_smoketest_perf elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then TORCHBENCHPATH=/torchbench test_torchbench_gcp_smoketest else +======= + checkout_install_torchbench hf_Bert hf_Albert 
timm_vision_transformer + PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf + elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then + checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_edgecnn \ + llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \ + functorch_maml_omniglot yolov3 mobilenet_v2 resnext50_32x4d densenet121 mnasnet1_0 + PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf + elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then + checkout_install_torchbench + TORCHBENCHPATH=$(pwd)/torchbench test_torchbench_gcp_smoketest + else + checkout_install_torchbench +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Do this after checkout_install_torchbench to ensure we clobber any # nightlies that torchbench may pull in if [[ "${TEST_CONFIG}" != *cpu* ]]; then install_torchrec_and_fbgemm fi +<<<<<<< HEAD PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id" fi elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then @@ -1771,6 +2052,24 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then elif [[ "${TEST_CONFIG}" == *inductor* ]]; then install_torchvision test_inductor_shard "${SHARD_NUMBER}" +======= + PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id" + fi +elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then + install_torchaudio cuda + install_torchvision + checkout_install_torchbench hf_T5 llama moco + PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER" + test_inductor_aoti +elif [[ "${TEST_CONFIG}" == *inductor* ]]; then + install_torchvision + test_inductor_shard "${SHARD_NUMBER}" + if [[ "${SHARD_NUMBER}" == 1 ]]; then + if [[ "${BUILD_ENVIRONMENT}" != linux-jammy-py3.9-gcc11-build ]]; then + test_inductor_distributed + fi + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${TEST_CONFIG}" == *einops* ]]; then test_einops elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then @@ -1820,6 +2119,7 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then test_xpu_bin elif [[ "${TEST_CONFIG}" == smoke ]]; then test_python_smoke +<<<<<<< HEAD elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then test_python_smoke_b200 elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then @@ -1830,6 +2130,10 @@ elif [[ "${TEST_CONFIG}" == "b200-symm-mem" ]]; then test_h100_symm_mem elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then test_h100_cutlass_backend +======= +elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then + test_h100_distributed +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else install_torchvision install_monkeytype diff --git a/.ci/pytorch/test_example_code/CMakeLists.txt b/.ci/pytorch/test_example_code/CMakeLists.txt index e87f37ae61fb4..688395d1615d9 100644 --- a/.ci/pytorch/test_example_code/CMakeLists.txt +++ b/.ci/pytorch/test_example_code/CMakeLists.txt @@ -16,7 +16,11 @@ target_link_libraries(simple-torch-test CUDA::cudart CUDA::cufft CUDA::cusparse find_library(CUDNN_LIBRARY NAMES cudnn) target_link_libraries(simple-torch-test ${CUDNN_LIBRARY} ) if(MSVC) +<<<<<<< HEAD file(GLOB TORCH_DLLS "$ENV{CUDA_PATH}/bin/cudnn64_8.dll" "$ENV{NVTOOLSEXT_PATH}/bin/x64/*.dll") +======= + file(GLOB TORCH_DLLS "$ENV{CUDA_PATH}/bin/cudnn64_8.dll") +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) message("dlls to copy " ${TORCH_DLLS}) add_custom_command(TARGET simple-torch-test POST_BUILD diff --git a/.ci/pytorch/win-test-helpers/build_pytorch.bat b/.ci/pytorch/win-test-helpers/build_pytorch.bat index 240cc8b559322..960942000d714 100644 --- a/.ci/pytorch/win-test-helpers/build_pytorch.bat +++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat @@ -38,13 +38,20 @@ if errorlevel 1 goto fail if not errorlevel 0 goto fail :: Update CMake +<<<<<<< HEAD :: TODO: Investigate why this helps MKL detection, even when CMake from choco is not used +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) call choco upgrade -y cmake --no-progress --installargs 'ADD_CMAKE_TO_PATH=System' --apply-install-arguments-to-dependencies --version=3.27.9 if errorlevel 1 goto fail if not errorlevel 0 goto fail +<<<<<<< HEAD :: TODO: Move to .ci/docker/requirements-ci.txt call pip install mkl==2024.2.0 mkl-static==2024.2.0 mkl-include==2024.2.0 +======= +call pip install mkl-include==2021.4.0 mkl-devel==2021.4.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if errorlevel 1 goto fail if not errorlevel 0 goto fail @@ -63,10 +70,16 @@ if "%USE_XPU%"=="1" ( call "C:\Program Files (x86)\Intel\oneAPI\compiler\latest\env\vars.bat" call "C:\Program Files (x86)\Intel\oneAPI\ocloc\latest\env\vars.bat" if errorlevel 1 exit /b 1 +<<<<<<< HEAD :: Reduce build time SET TORCH_XPU_ARCH_LIST=bmg :: Re-setup python env for build call pip install -r requirements.txt +======= + :: Reduce build time. Only have MTL self-hosted runner now + SET TORCH_XPU_ARCH_LIST=xe-lpg + SET USE_KINETO=0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) @echo on @@ -132,14 +145,22 @@ if "%USE_CUDA%"=="1" ( :: Print all existing environment variable for debugging set +<<<<<<< HEAD python -m build --wheel --no-isolation +======= +python setup.py bdist_wheel +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if errorlevel 1 goto fail if not errorlevel 0 goto fail sccache --show-stats python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])" ( if "%BUILD_ENVIRONMENT%"=="" ( +<<<<<<< HEAD echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash. +======= + echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) else ( copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%" diff --git a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat index abd2c8722b11d..09c66282f04d2 100644 --- a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat +++ b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat @@ -3,12 +3,20 @@ if "%BUILD_ENVIRONMENT%"=="" ( ) else ( set CONDA_PARENT_DIR=C:\Jenkins ) +<<<<<<< HEAD set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3 +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :: Be conservative here when rolling out the new AMI with conda. This will try :: to install conda as before if it couldn't find the conda installation. This :: can be removed eventually after we gain enough confidence in the AMI +<<<<<<< HEAD if not exist %CONDA_ROOT_DIR% ( +======= +if not exist %CONDA_PARENT_DIR%\Miniconda3 ( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set INSTALL_FRESH_CONDA=1 ) @@ -17,14 +25,22 @@ if "%INSTALL_FRESH_CONDA%"=="1" ( if errorlevel 1 exit /b if not errorlevel 0 exit /b +<<<<<<< HEAD %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR% +======= + %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if errorlevel 1 exit /b if not errorlevel 0 exit /b ) :: Activate conda so that we can use its commands, i.e. conda, python, pip +<<<<<<< HEAD call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR% :: Activate conda so that we can use its commands, i.e. conda, python, pip call conda activate py_tmp call pip install -r .ci/docker/requirements-ci.txt +======= +call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat index 3173582b06f45..928fc58113ca6 100644 --- a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -14,7 +14,11 @@ if not errorlevel 0 exit /b :: build\torch. Rather than changing all these references, making a copy of torch folder :: from conda to the current workspace is easier. The workspace will be cleaned up after :: the job anyway +<<<<<<< HEAD xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ +======= +xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pushd . 
if "%VC_VERSION%" == "" ( diff --git a/.ci/pytorch/win-test-helpers/test_libtorch.bat b/.ci/pytorch/win-test-helpers/test_libtorch.bat index d6ecd72188760..68dd3d4c28ec4 100644 --- a/.ci/pytorch/win-test-helpers/test_libtorch.bat +++ b/.ci/pytorch/win-test-helpers/test_libtorch.bat @@ -15,6 +15,7 @@ if errorlevel 1 exit /b 1 if not errorlevel 0 exit /b 1 cd %TMP_DIR_WIN%\build\torch\test +<<<<<<< HEAD :: Enable delayed variable expansion to make the list setlocal enabledelayedexpansion @@ -44,6 +45,39 @@ if errorlevel 1 goto fail if not errorlevel 0 goto fail goto :eof +======= +for /r "." %%a in (*.exe) do ( + call :libtorch_check "%%~na" "%%~fa" + if errorlevel 1 goto fail +) + +goto :eof + +:libtorch_check + +cd %CWD% +set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\test + +:: Skip verify_api_visibility as it a compile level test +if "%~1" == "verify_api_visibility" goto :eof + +echo Running "%~2" +if "%~1" == "c10_intrusive_ptr_benchmark" ( + :: NB: This is not a gtest executable file, thus couldn't be handled by pytest-cpp + call "%~2" + goto :eof +) + +python test\run_test.py --cpp --verbose -i "cpp/%~1" +if errorlevel 1 ( + echo %1 failed with exit code %errorlevel% + goto fail +) +if not errorlevel 0 ( + echo %1 failed with exit code %errorlevel% + goto fail +) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :eof exit /b 0 diff --git a/.ci/pytorch/win-test-helpers/test_python_shard.bat b/.ci/pytorch/win-test-helpers/test_python_shard.bat index 02829ee369757..f0489db1875f9 100644 --- a/.ci/pytorch/win-test-helpers/test_python_shard.bat +++ b/.ci/pytorch/win-test-helpers/test_python_shard.bat @@ -25,7 +25,11 @@ echo Copying over test times file robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files" echo Run nn tests +<<<<<<< HEAD python run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose +======= +python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if ERRORLEVEL 1 goto fail popd diff --git a/.ci/pytorch/win-test.sh b/.ci/pytorch/win-test.sh index a01aa0b6431cd..b7de62bfe553f 100755 --- a/.ci/pytorch/win-test.sh +++ b/.ci/pytorch/win-test.sh @@ -37,8 +37,25 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda" fi +<<<<<<< HEAD # TODO: Move this to .ci/docker/requirements-ci.txt python -m pip install "psutil==5.9.1" nvidia-ml-py "pytest-shard==0.1.2" +======= +# TODO: Move both of them to Windows AMI +python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1 + +# Install Z3 optional dependency for Windows builds. +python -m pip install z3-solver==4.12.2.0 + +# Install tlparse for test\dynamo\test_structured_trace.py UTs. 
+python -m pip install tlparse==0.3.30 + +# Install parameterized +python -m pip install parameterized==0.8.1 + +# Install pulp for testing ilps under torch\distributed\_tools +python -m pip install pulp==2.9.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run_tests() { # Run nvidia-smi if available diff --git a/.ci/pytorch/windows/arm64/build_pytorch.bat b/.ci/pytorch/windows/arm64/build_pytorch.bat index b5c2ef65b84ad..dfcc8b7fd47af 100644 --- a/.ci/pytorch/windows/arm64/build_pytorch.bat +++ b/.ci/pytorch/windows/arm64/build_pytorch.bat @@ -48,7 +48,11 @@ sccache --zero-stats sccache --show-stats :: Call PyTorch build script +<<<<<<< HEAD python -m build --wheel --no-isolation --outdir "%PYTORCH_FINAL_PACKAGE_DIR%" +======= +python setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :: show sccache stats sccache --show-stats diff --git a/.ci/pytorch/windows/cuda126.bat b/.ci/pytorch/windows/cuda126.bat index efb8cfec63e7e..2db616810ecb6 100644 --- a/.ci/pytorch/windows/cuda126.bat +++ b/.ci/pytorch/windows/cuda126.bat @@ -18,6 +18,7 @@ REM Check for optional components set USE_CUDA= set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 +<<<<<<< HEAD IF "%NVTOOLSEXT_PATH%"=="" ( IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" ( set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt @@ -27,6 +28,8 @@ IF "%NVTOOLSEXT_PATH%"=="" ( ) ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) IF "%CUDA_PATH_V126%"=="" ( IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin\nvcc.exe" ( set "CUDA_PATH_V126=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6" @@ -37,7 +40,11 @@ IF "%CUDA_PATH_V126%"=="" ( ) IF "%BUILD_VISION%" == "" ( +<<<<<<< HEAD set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;9.0 +======= + set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set TORCH_NVCC_FLAGS=-Xfatbin -compress-all ) ELSE ( set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 diff --git a/.ci/pytorch/windows/cuda128.bat b/.ci/pytorch/windows/cuda128.bat index bbd349e2efb4b..0234ec324c039 100644 --- a/.ci/pytorch/windows/cuda128.bat +++ b/.ci/pytorch/windows/cuda128.bat @@ -18,6 +18,7 @@ REM Check for optional components set USE_CUDA= set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 +<<<<<<< HEAD IF "%NVTOOLSEXT_PATH%"=="" ( IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" ( set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt @@ -27,6 +28,8 @@ IF "%NVTOOLSEXT_PATH%"=="" ( ) ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) IF "%CUDA_PATH_V128%"=="" ( IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin\nvcc.exe" ( set "CUDA_PATH_V128=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8" @@ -37,10 
+40,17 @@ IF "%CUDA_PATH_V128%"=="" ( ) IF "%BUILD_VISION%" == "" ( +<<<<<<< HEAD set TORCH_CUDA_ARCH_LIST=7.0;7.5;8.0;8.6;9.0;10.0;12.0 set TORCH_NVCC_FLAGS=-Xfatbin -compress-all ) ELSE ( set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120 +======= + set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0;10.0;12.0 + set TORCH_NVCC_FLAGS=-Xfatbin -compress-all +) ELSE ( + set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) set "CUDA_PATH=%CUDA_PATH_V128%" diff --git a/.ci/pytorch/windows/cuda129.bat b/.ci/pytorch/windows/cuda129.bat index b17e6113c63e2..ad19af5363c3c 100644 --- a/.ci/pytorch/windows/cuda129.bat +++ b/.ci/pytorch/windows/cuda129.bat @@ -18,6 +18,7 @@ REM Check for optional components set USE_CUDA= set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 +<<<<<<< HEAD IF "%NVTOOLSEXT_PATH%"=="" ( IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" ( set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt @@ -27,6 +28,8 @@ IF "%NVTOOLSEXT_PATH%"=="" ( ) ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) IF "%CUDA_PATH_V129%"=="" ( IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9\bin\nvcc.exe" ( set "CUDA_PATH_V129=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9" diff --git a/.ci/pytorch/windows/internal/copy.bat b/.ci/pytorch/windows/internal/copy.bat index e0281c0d78a44..993f11e1e0142 100644 --- a/.ci/pytorch/windows/internal/copy.bat +++ b/.ci/pytorch/windows/internal/copy.bat @@ -1,3 +1,4 @@ +<<<<<<< HEAD if %CUDA_VERSION% geq 130 ( set "dll_path=bin\x64" @@ -19,6 +20,19 @@ copy "%CUDA_PATH%\extras\CUPTI\lib64\cupti64_*.dll*" pytorch\torch\lib copy "%CUDA_PATH%\extras\CUPTI\lib64\nvperf_host*.dll*" pytorch\torch\lib copy "C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64\nvToolsExt64_1.dll*" pytorch\torch\lib +======= +copy "%CUDA_PATH%\bin\cusparse*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\bin\cublas*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\bin\cudart*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\bin\curand*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\bin\cufft*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\bin\cusolver*64_*.dll*" pytorch\torch\lib + +copy "%CUDA_PATH%\bin\cudnn*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\bin\nvrtc*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\extras\CUPTI\lib64\cupti64_*.dll*" pytorch\torch\lib + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) copy "%PYTHON_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib :: Should be set in build_pytorch.bat @@ -28,3 +42,11 @@ copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib if 
exist "C:\Windows\System32\zlibwapi.dll" ( copy "C:\Windows\System32\zlibwapi.dll" pytorch\torch\lib ) +<<<<<<< HEAD +======= + +::copy nvJitLink dll is requires for cuda 12+ +if exist "%CUDA_PATH%\bin\nvJitLink_*.dll*" ( + copy "%CUDA_PATH%\bin\nvJitLink_*.dll*" pytorch\torch\lib +) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/windows/internal/cuda_install.bat b/.ci/pytorch/windows/internal/cuda_install.bat index 1349d3e661f55..b17eda7de7815 100644 --- a/.ci/pytorch/windows/internal/cuda_install.bat +++ b/.ci/pytorch/windows/internal/cuda_install.bat @@ -26,7 +26,10 @@ if exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR% if %CUDA_VER% EQU 126 goto cuda126 if %CUDA_VER% EQU 128 goto cuda128 if %CUDA_VER% EQU 129 goto cuda129 +<<<<<<< HEAD if %CUDA_VER% EQU 130 goto cuda130 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo CUDA %CUDA_VERSION_STR% is not supported exit /b 1 @@ -114,6 +117,7 @@ xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" goto cuda_common +<<<<<<< HEAD :cuda130 set CUDA_INSTALL_EXE=cuda_13.0.0_windows.exe @@ -141,17 +145,22 @@ xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" goto cuda_common +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :cuda_common :: NOTE: We only install CUDA if we don't have it installed already. :: With GHA runners these should be pre-installed as part of our AMI process :: If you cannot find the CUDA version you want to build for here then please :: add it @ https://github.com/pytorch/test-infra/tree/main/aws/ami/windows if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" ( +<<<<<<< HEAD if not exist "%SRC_DIR%\temp_build\NvToolsExt.7z" ( curl -k -L https://ossci-windows.s3.us-east-1.amazonaws.com/builder/NvToolsExt.7z --output "%SRC_DIR%\temp_build\NvToolsExt.7z" if errorlevel 1 exit /b 1 ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if not exist "%SRC_DIR%\temp_build\gpu_driver_dlls.zip" ( curl -k -L "https://ossci-windows.s3.us-east-1.amazonaws.com/builder/additional_dlls.zip" --output "%SRC_DIR%\temp_build\gpu_driver_dlls.zip" if errorlevel 1 exit /b 1 @@ -178,6 +187,7 @@ if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_ xcopy /Y "%SRC_DIR%\temp_build\cuda\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations" ) +<<<<<<< HEAD echo Installing NvToolsExt... 7z x %SRC_DIR%\temp_build\NvToolsExt.7z -o"%SRC_DIR%\temp_build\NvToolsExt" mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64" @@ -187,6 +197,8 @@ if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_ xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\include\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include" xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\lib\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo Installing cuDNN... 
7z x %CUDNN_SETUP_FILE% -o"%SRC_DIR%\temp_build\cudnn" xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin" @@ -217,4 +229,7 @@ echo Setting up environment... set "PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\libnvvp;%PATH%" set "CUDA_PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" set "CUDA_PATH_V%CUDA_VER_MAJOR%_%CUDA_VER_MINOR%=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" +<<<<<<< HEAD set "NVTOOLSEXT_PATH=%ProgramFiles%\NVIDIA Corporation\NvToolsExt" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/windows/internal/driver_update.bat b/.ci/pytorch/windows/internal/driver_update.bat index 2c173aed818b4..f9ffb6de2fd29 100644 --- a/.ci/pytorch/windows/internal/driver_update.bat +++ b/.ci/pytorch/windows/internal/driver_update.bat @@ -1,3 +1,4 @@ +<<<<<<< HEAD set WIN_DRIVER_VN=580.88 set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe" & REM @lint-ignore curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe @@ -7,3 +8,14 @@ start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-inte if errorlevel 1 exit /b 1 del %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe || ver > NUL +======= +set WIN_DRIVER_VN=528.89 +set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe" & REM @lint-ignore +curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe +if errorlevel 1 exit /b 1 + +start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe -s -noreboot +if errorlevel 1 exit /b 1 + +del %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe || ver > NUL +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/windows/internal/install_python.bat b/.ci/pytorch/windows/internal/install_python.bat index b2f68af97b3f4..1ab15fc544cff 100644 --- a/.ci/pytorch/windows/internal/install_python.bat +++ b/.ci/pytorch/windows/internal/install_python.bat @@ -1,12 +1,16 @@ set ADDITIONAL_OPTIONS="" set PYTHON_EXEC="python" +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if "%DESIRED_PYTHON%" == "3.13t" ( echo Python version is set to 3.13t set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe" set ADDITIONAL_OPTIONS="Include_freethreaded=1" set PYTHON_EXEC="python3.13t" +<<<<<<< HEAD ) else if "%DESIRED_PYTHON%"=="3.14t" ( echo Python version is set to 3.14 or 3.14t set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0-amd64.exe" @@ -14,6 +18,10 @@ if "%DESIRED_PYTHON%" == "3.13t" ( set PYTHON_EXEC="python3.14t" ) else ( echo Python version is set to %DESIRED_PYTHON% +======= +) else ( + echo DESIRED_PYTHON not defined, Python version is set to 
%DESIRED_PYTHON% +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/%DESIRED_PYTHON%.0/python-%DESIRED_PYTHON%.0-amd64.exe" %= @lint-ignore =% ) @@ -25,5 +33,8 @@ start /wait "" python-amd64.exe /quiet InstallAllUsers=1 PrependPath=0 Include_t if errorlevel 1 exit /b 1 set "PATH=%CD%\Python\Scripts;%CD%\Python;%PATH%" +<<<<<<< HEAD %PYTHON_EXEC% -m pip install --upgrade pip setuptools packaging wheel build if errorlevel 1 exit /b 1 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/windows/internal/setup.bat b/.ci/pytorch/windows/internal/setup.bat index 34a5140cb1ee0..bd7627ed7bede 100644 --- a/.ci/pytorch/windows/internal/setup.bat +++ b/.ci/pytorch/windows/internal/setup.bat @@ -86,7 +86,11 @@ copy /Y "%LIBTORCH_PREFIX%-%PYTORCH_BUILD_VERSION%.zip" "%PYTORCH_FINAL_PACKAGE_ goto build_end :pytorch +<<<<<<< HEAD %PYTHON_EXEC% -m build --wheel --no-isolation --outdir "%PYTORCH_FINAL_PACKAGE_DIR%" +======= +%PYTHON_EXEC% setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :build_end IF ERRORLEVEL 1 exit /b 1 diff --git a/.ci/pytorch/windows/internal/smoke_test.bat b/.ci/pytorch/windows/internal/smoke_test.bat index f671a9d0e0abb..eb803a058cae0 100644 --- a/.ci/pytorch/windows/internal/smoke_test.bat +++ b/.ci/pytorch/windows/internal/smoke_test.bat @@ -148,7 +148,18 @@ if "%NVIDIA_GPU_EXISTS%" == "0" ( goto end ) +<<<<<<< HEAD cl %PYTORCH_ROOT%\.ci\pytorch\test_example_code\check-torch-cuda.cpp torch_cpu.lib c10.lib torch_cuda.lib /EHsc /std:c++17 /link /INCLUDE:?warp_size@cuda@at@@YAHXZ +======= +set BUILD_SPLIT_CUDA= +if exist "%install_root%\lib\torch_cuda_cu.lib" if exist "%install_root%\lib\torch_cuda_cpp.lib" set BUILD_SPLIT_CUDA=ON + +if "%BUILD_SPLIT_CUDA%" == "ON" ( + cl %PYTORCH_ROOT%\.ci\pytorch\test_example_code\check-torch-cuda.cpp torch_cpu.lib c10.lib torch_cuda_cu.lib torch_cuda_cpp.lib /EHsc /std:c++17 /link /INCLUDE:?warp_size@cuda@at@@YAHXZ /INCLUDE:?_torch_cuda_cu_linker_symbol_op_cuda@native@at@@YA?AVTensor@2@AEBV32@@Z +) else ( + cl %PYTORCH_ROOT%\.ci\pytorch\test_example_code\check-torch-cuda.cpp torch_cpu.lib c10.lib torch_cuda.lib /EHsc /std:c++17 /link /INCLUDE:?warp_size@cuda@at@@YAHXZ +) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) .\check-torch-cuda.exe if ERRORLEVEL 1 exit /b 1 diff --git a/.ci/pytorch/windows/internal/static_lib_test.bat b/.ci/pytorch/windows/internal/static_lib_test.bat index cd1fc484ae155..f06b3a7c0b41e 100644 --- a/.ci/pytorch/windows/internal/static_lib_test.bat +++ b/.ci/pytorch/windows/internal/static_lib_test.bat @@ -63,7 +63,11 @@ if errorlevel 1 exit /b 1 call %CONDA_HOME%\condabin\activate.bat testenv if errorlevel 1 exit /b 1 +<<<<<<< HEAD call conda install -y -q -c conda-forge libuv=1.51 +======= +call conda install -y -q -c conda-forge libuv=1.39 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) call conda install -y -q intel-openmp echo "install and test libtorch" diff --git a/.ci/pytorch/windows/internal/xpu_install.bat b/.ci/pytorch/windows/internal/xpu_install.bat index f143571a56922..85b72caccaba0 
100644 --- a/.ci/pytorch/windows/internal/xpu_install.bat +++ b/.ci/pytorch/windows/internal/xpu_install.bat @@ -13,9 +13,15 @@ if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build" :xpu_bundle_install_start set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI +<<<<<<< HEAD set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product set XPU_BUNDLE_VERSION=2025.1.3+5 +======= +set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe +set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product +set XPU_BUNDLE_VERSION=2025.0.1+20 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set XPU_BUNDLE_INSTALLED=0 set XPU_BUNDLE_UNINSTALL=0 set XPU_EXTRA_URL=NULL @@ -24,9 +30,15 @@ set XPU_EXTRA_VERSION=2025.0.1+1226 set XPU_EXTRA_INSTALLED=0 set XPU_EXTRA_UNINSTALL=0 +<<<<<<< HEAD if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.2] ( set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe set XPU_BUNDLE_VERSION=2025.2.1+20 +======= +if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.1] ( + set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe + set XPU_BUNDLE_VERSION=2025.1.3+5 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) :: Check if XPU bundle is target version or already installed @@ -90,3 +102,17 @@ if errorlevel 1 exit /b 1 del xpu_extra.exe :xpu_install_end +<<<<<<< HEAD +======= + +if not "%XPU_ENABLE_KINETO%"=="1" goto install_end +:: Install Level Zero SDK +set XPU_EXTRA_LZ_URL=https://github.com/oneapi-src/level-zero/releases/download/v1.14.0/level-zero-sdk_1.14.0.zip +curl -k -L %XPU_EXTRA_LZ_URL% --output "%SRC_DIR%\temp_build\level_zero_sdk.zip" +echo "Installing level zero SDK..." 
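Editor's note on the xpu_install.bat hunk: the non-HEAD branch downloads the Level Zero SDK only when XPU_ENABLE_KINETO=1 and prepends its headers to %INCLUDE% (the download starts just above and the extraction continues on the next line). A hedged, Linux-flavoured sketch of the same step — the temp paths are assumptions, the URL is the one pinned in the script:

```bash
# Hedged sketch only: fetch the Level Zero SDK named in the batch script and
# expose its headers so Kineto's XPU profiler support can build.
if [ "${XPU_ENABLE_KINETO:-0}" = "1" ]; then
  curl -kL "https://github.com/oneapi-src/level-zero/releases/download/v1.14.0/level-zero-sdk_1.14.0.zip" \
       -o /tmp/level_zero_sdk.zip
  unzip -o /tmp/level_zero_sdk.zip -d /tmp/level_zero     # zip ships a top-level include/ dir
  export CPLUS_INCLUDE_PATH="/tmp/level_zero/include:${CPLUS_INCLUDE_PATH:-}"
fi
```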
+7z x "%SRC_DIR%\temp_build\level_zero_sdk.zip" -o"%SRC_DIR%\temp_build\level_zero" +set "INCLUDE=%SRC_DIR%\temp_build\level_zero\include;%INCLUDE%" +del "%SRC_DIR%\temp_build\level_zero_sdk.zip" + +:install_end +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/windows/setup_build.bat b/.ci/pytorch/windows/setup_build.bat index a7addd5d712d0..279b65b19b939 100644 --- a/.ci/pytorch/windows/setup_build.bat +++ b/.ci/pytorch/windows/setup_build.bat @@ -7,8 +7,11 @@ call "internal\install_python.bat" %PYTHON_EXEC% --version set "PATH=%CD%\Python\Lib\site-packages\cmake\data\bin;%CD%\Python\Scripts;%CD%\Python;%PATH%" +<<<<<<< HEAD if "%DESIRED_PYTHON%" == "3.14t" %PYTHON_EXEC% -m pip install numpy==2.3.2 cmake if "%DESIRED_PYTHON%" == "3.14" %PYTHON_EXEC% -m pip install numpy==2.3.2 cmake +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if "%DESIRED_PYTHON%" == "3.13t" %PYTHON_EXEC% -m pip install numpy==2.2.1 cmake if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install numpy==2.1.2 cmake if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake @@ -18,7 +21,11 @@ if "%DESIRED_PYTHON%" == "3.9" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake %PYTHON_EXEC% -m pip install pyyaml %PYTHON_EXEC% -m pip install mkl-include mkl-static +<<<<<<< HEAD %PYTHON_EXEC% -m pip install boto3 requests ninja typing_extensions setuptools==72.1.0 +======= +%PYTHON_EXEC% -m pip install boto3 ninja typing_extensions setuptools==72.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) where cmake.exe diff --git a/.ci/wheel/build_wheel.sh b/.ci/wheel/build_wheel.sh index 6123e8abc8c0c..1adf8e63941ef 100755 --- a/.ci/wheel/build_wheel.sh +++ b/.ci/wheel/build_wheel.sh @@ -85,7 +85,11 @@ mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true # Create an isolated directory to store this builds pytorch checkout and conda # installation if [[ -z "$MAC_PACKAGE_WORK_DIR" ]]; then +<<<<<<< HEAD MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_${DESIRED_PYTHON}_$(date +%H%M%S)" +======= + MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_conda_${DESIRED_PYTHON}_$(date +%H%M%S)" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi mkdir -p "$MAC_PACKAGE_WORK_DIR" || true if [[ -n ${GITHUB_ACTIONS} ]]; then @@ -96,11 +100,19 @@ fi whl_tmp_dir="${MAC_PACKAGE_WORK_DIR}/dist" mkdir -p "$whl_tmp_dir" +<<<<<<< HEAD mac_version='macosx-11_0-arm64' libtorch_arch='arm64' # Create a consistent wheel package name to rename the wheel to wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version//[-,]/_}.whl" +======= +mac_version='macosx_11_0_arm64' +libtorch_arch='arm64' + +# Create a consistent wheel package name to rename the wheel to +wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version}.whl" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ########################################################### @@ -124,12 +136,21 @@ popd export TH_BINARY_BUILD=1 export INSTALL_TEST=0 # dont install test binaries into site-packages +<<<<<<< HEAD export MACOSX_DEPLOYMENT_TARGET=11.0 +======= +export 
MACOSX_DEPLOYMENT_TARGET=10.15 +export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + +SETUPTOOLS_PINNED_VERSION="=46.0.0" +PYYAML_PINNED_VERSION="=5.3" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) EXTRA_CONDA_INSTALL_FLAGS="" CONDA_ENV_CREATE_FLAGS="" RENAME_WHEEL=true case $desired_python in +<<<<<<< HEAD 3.14t) echo "Using 3.14 deps" mac_version='macosx-11.0-arm64' @@ -146,10 +167,21 @@ case $desired_python in echo "Using 3.13t deps" mac_version='macosx-11.0-arm64' NUMPY_PINNED_VERSION="==2.1.0" +======= + 3.13t) + echo "Using 3.13 deps" + SETUPTOOLS_PINNED_VERSION=">=68.0.0" + PYYAML_PINNED_VERSION=">=6.0.1" + NUMPY_PINNED_VERSION="=2.1.0" + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" + desired_python="3.13" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RENAME_WHEEL=false ;; 3.13) echo "Using 3.13 deps" +<<<<<<< HEAD NUMPY_PINNED_VERSION="==2.1.0" ;; 3.12) @@ -176,21 +208,87 @@ PINNED_PACKAGES=( python -mvenv ~/${desired_python}-build source ~/${desired_python}-build/bin/activate retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt" +======= + SETUPTOOLS_PINNED_VERSION=">=68.0.0" + PYYAML_PINNED_VERSION=">=6.0.1" + NUMPY_PINNED_VERSION="=2.1.0" + ;; + 3.12) + echo "Using 3.12 deps" + SETUPTOOLS_PINNED_VERSION=">=68.0.0" + PYYAML_PINNED_VERSION=">=6.0.1" + NUMPY_PINNED_VERSION="=2.0.2" + ;; + 3.11) + echo "Using 3.11 deps" + SETUPTOOLS_PINNED_VERSION=">=46.0.0" + PYYAML_PINNED_VERSION=">=5.3" + NUMPY_PINNED_VERSION="=2.0.2" + ;; + 3.10) + echo "Using 3.10 deps" + SETUPTOOLS_PINNED_VERSION=">=46.0.0" + PYYAML_PINNED_VERSION=">=5.3" + NUMPY_PINNED_VERSION="=2.0.2" + ;; + 3.9) + echo "Using 3.9 deps" + SETUPTOOLS_PINNED_VERSION=">=46.0.0" + PYYAML_PINNED_VERSION=">=5.3" + NUMPY_PINNED_VERSION="=2.0.2" + ;; + *) + echo "Using default deps" + NUMPY_PINNED_VERSION="=1.11.3" + ;; +esac + +# Install into a fresh env +tmp_env_name="wheel_py$python_nodot" +conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} +source activate "$tmp_env_name" + +pip install "numpy=${NUMPY_PINNED_VERSION}" "pyyaml${PYYAML_PINNED_VERSION}" requests ninja "setuptools${SETUPTOOLS_PINNED_VERSION}" typing_extensions +retry pip install -r "${pytorch_rootdir}/requirements.txt" || true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retry brew install libomp # For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule export USE_DISTRIBUTED=1 +<<<<<<< HEAD +======= +if [[ -n "$CROSS_COMPILE_ARM64" ]]; then + export CMAKE_OSX_ARCHITECTURES=arm64 +fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) export USE_MKLDNN=OFF export USE_QNNPACK=OFF export BUILD_TEST=OFF pushd "$pytorch_rootdir" +<<<<<<< HEAD echo "Calling -m build --wheel --no-isolation at $(date)" _PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python -m build --wheel --no-isolation --outdir "$whl_tmp_dir" -C--plat-name="${mac_version//[-.]/_}" echo "Finished -m build --wheel --no-isolation at $(date)" +======= +echo "Calling setup.py bdist_wheel at $(date)" + +if [[ "$USE_SPLIT_BUILD" == "true" ]]; then + echo "Calling setup.py bdist_wheel for split 
build (BUILD_LIBTORCH_WHL)" + BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel -d "$whl_tmp_dir" + echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)" + echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" + BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 CMAKE_FRESH=1 python setup.py bdist_wheel -d "$whl_tmp_dir" + echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" +else + python setup.py bdist_wheel -d "$whl_tmp_dir" +fi + +echo "Finished setup.py bdist_wheel at $(date)" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ $package_type != 'libtorch' ]]; then echo "delocating wheel dependencies" diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh index c24a50b8b17ed..0af272b341abf 100755 --- a/.circleci/scripts/binary_linux_test.sh +++ b/.circleci/scripts/binary_linux_test.sh @@ -65,8 +65,21 @@ fi if [[ "$PACKAGE_TYPE" != libtorch ]]; then if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then +<<<<<<< HEAD pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}" retry pip install -q numpy protobuf typing-extensions +======= + if [[ "$USE_SPLIT_BUILD" == "true" ]]; then + pkg_no_python="$(ls -1 /final_pkgs/torch_no_python* | sort |tail -1)" + pkg_torch="$(ls -1 /final_pkgs/torch-* | sort |tail -1)" + # todo: after folder is populated use the pypi_pkg channel instead + pip install "\$pkg_no_python" "\$pkg_torch" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}_pypi_pkg" + retry pip install -q numpy protobuf typing-extensions + else + pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}" + retry pip install -q numpy protobuf typing-extensions + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else pip install "\$pkg" retry pip install -q numpy protobuf typing-extensions diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index 3f747e1a186ae..30a2daafd6c93 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -5,7 +5,13 @@ export TZ=UTC tagged_version() { GIT_DIR="${workdir}/pytorch/.git" GIT_DESCRIBE="git --git-dir ${GIT_DIR} describe --tags --match v[0-9]*.[0-9]*.[0-9]*" +<<<<<<< HEAD if [[ ! -d "${GIT_DIR}" ]]; then +======= + if [[ -n "${CIRCLE_TAG:-}" ]]; then + echo "${CIRCLE_TAG}" + elif [[ ! -d "${GIT_DIR}" ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "Abort, abort! Git dir ${GIT_DIR} does not exists!" kill $$ elif ${GIT_DESCRIBE} --exact >/dev/null; then @@ -73,7 +79,18 @@ export PYTORCH_BUILD_NUMBER=1 : <<'BLOCK_COMMENT' # Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) +<<<<<<< HEAD TRITON_CONSTRAINT="platform_system == 'Linux'" +======= + +# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT +TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'" + +# CUDA 12.9 builds have triton for Linux and Linux aarch64 binaries. 
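Editor's note on the TRITON_CONSTRAINT logic above (and the cu129 special case that follows): the constraint is a standard PEP 508 environment marker, so pip decides at install time whether the wheel's triton pin applies to the machine doing the install. A hedged illustration of the string that ends up in PYTORCH_EXTRA_INSTALL_REQUIREMENTS — the version and short hash below are placeholders, not the pinned values from .ci/docker:

```bash
# Hedged example only; values are placeholders.
TRITON_VERSION="3.0.0"
TRITON_SHORTHASH="abcdef12"   # first 8 chars of .ci/docker/ci_commit_pins/triton.txt
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"

TRITON_REQUIREMENT="triton==${TRITON_VERSION}+git${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
echo "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS}"
# -> triton==3.0.0+gitabcdef12; platform_system == 'Linux' and platform_machine == 'x86_64'
# pip evaluates the marker on the installing machine, so installs on platforms
# that don't match the marker simply skip the triton dependency.
```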
+if [[ "$DESIRED_CUDA" == "cu129" ]]; then + TRITON_CONSTRAINT="platform_system == 'Linux'" +fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}" @@ -86,10 +103,17 @@ fi # Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton rocm package if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" ]]; then +<<<<<<< HEAD TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}" if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then TRITON_SHORTHASH=$(cut -c1-8 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt) TRITON_REQUIREMENT="triton==${TRITON_VERSION}+git${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}" +======= + TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}; ${TRITON_CONSTRAINT}" + if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then + TRITON_SHORTHASH=$(cut -c1-8 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt) + TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+git${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}" @@ -130,6 +154,10 @@ export DESIRED_PYTHON="${DESIRED_PYTHON:-}" export DESIRED_CUDA="$DESIRED_CUDA" export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}" export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}" +<<<<<<< HEAD +======= +export USE_SPLIT_BUILD="${USE_SPLIT_BUILD:-}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "${OSTYPE}" == "msys" ]]; then export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}" if [[ "${LIBTORCH_CONFIG:-}" == 'debug' ]]; then @@ -166,6 +194,7 @@ if [[ "$(uname)" != Darwin ]]; then MEMORY_LIMIT_MAX_JOBS=12 NUM_CPUS=$(( $(nproc) - 2 )) +<<<<<<< HEAD if [[ "$(uname)" == Linux ]]; then # Defaults here for **binary** linux builds so they can be changed in one place export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))} @@ -173,6 +202,10 @@ if [[ "$(uname)" != Darwin ]]; then # For other builds export MAX_JOBS=${NUM_CPUS} fi +======= + # Defaults here for **binary** linux builds so they can be changed in one place + export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? 
${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cat >>"$envfile" <>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # this is special build with all dependencies packaged if [[ ${BUILD_NAME} == *-full* ]]; then UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_full" @@ -51,12 +58,23 @@ s3_upload() { s3_upload_dir="${s3_root_dir}/${UPLOAD_SUBFOLDER}/" fi ( +<<<<<<< HEAD +======= + cache_control_flag="" + if [[ "${UPLOAD_CHANNEL}" = "test" ]]; then + cache_control_flag="--cache-control='no-cache,no-store,must-revalidate'" + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for pkg in ${PKG_DIR}/*.${extension}; do ( set -x shm_id=$(sha256sum "${pkg}" | awk '{print $1}') ${AWS_S3_CP} --no-progress --acl public-read "${pkg}" "${s3_upload_dir}" \ +<<<<<<< HEAD --metadata "checksum-sha256=${shm_id}" +======= + --metadata "checksum-sha256=${shm_id}" ${cache_control_flag} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) done ) diff --git a/.circleci/scripts/binary_windows_build.sh b/.circleci/scripts/binary_windows_build.sh index 18dcde50e2b65..60ffb1e15a817 100644 --- a/.circleci/scripts/binary_windows_build.sh +++ b/.circleci/scripts/binary_windows_build.sh @@ -15,7 +15,12 @@ fi if [[ "$DESIRED_CUDA" == 'xpu' ]]; then export VC_YEAR=2022 export USE_SCCACHE=0 +<<<<<<< HEAD export XPU_VERSION=2025.2 +======= + export XPU_VERSION=2025.1 + export XPU_ENABLE_KINETO=1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi echo "Free space on filesystem before build:" diff --git a/.circleci/scripts/binary_windows_test.sh b/.circleci/scripts/binary_windows_test.sh index 9326d9037e8b3..eb5b15b762cd1 100644 --- a/.circleci/scripts/binary_windows_test.sh +++ b/.circleci/scripts/binary_windows_test.sh @@ -8,7 +8,11 @@ export VC_YEAR=2022 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then export VC_YEAR=2022 +<<<<<<< HEAD export XPU_VERSION=2025.2 +======= + export XPU_VERSION=2025.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi pushd "$PYTORCH_ROOT/.ci/pytorch/" diff --git a/.circleci/scripts/functorch_doc_push_script.sh b/.circleci/scripts/functorch_doc_push_script.sh new file mode 100755 index 0000000000000..010956e212520 --- /dev/null +++ b/.circleci/scripts/functorch_doc_push_script.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# =================== The following code **should** be executed inside Docker container =================== + +# Install dependencies +sudo apt-get -y update +sudo apt-get -y install expect-dev + +# This is where the local pytorch install in the docker image is located +pt_checkout="/var/lib/jenkins/workspace" +source "$pt_checkout/.ci/pytorch/common_utils.sh" +echo "functorch_doc_push_script.sh: Invoked with $*" + +set -ex + +version=${DOCS_VERSION:-nightly} +echo "version: $version" + +# Build functorch docs +pushd $pt_checkout/functorch/docs +pip -q install -r requirements.txt +make html +popd + +git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages +pushd functorch_ghpages + +if [ $version == "main" ]; then + version=nightly +fi + +git rm -rf "$version" || true +mv 
"$pt_checkout/functorch/docs/build/html" "$version" + +git add "$version" || true +git status +git config user.email "soumith+bot@pytorch.org" +git config user.name "pytorchbot" +# If there aren't changes, don't make a commit; push is no-op +git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true +git status + +if [[ "${WITH_PUSH:-}" == true ]]; then + git push -u origin gh-pages +fi + +popd +# =================== The above code **should** be executed inside Docker container =================== diff --git a/.clang-format b/.clang-format index 67b722d967c7e..448aa5d0f343d 100644 --- a/.clang-format +++ b/.clang-format @@ -120,7 +120,10 @@ UseTab: Never Language: ObjC ColumnLimit: 120 AlignAfterOpenBracket: Align +<<<<<<< HEAD IndentWidth: 2 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ObjCBlockIndentWidth: 2 ObjCSpaceAfterProperty: false ObjCSpaceBeforeProtocolList: false diff --git a/.clang-tidy b/.clang-tidy index 71ffdf8cb224c..e85101e531adf 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -59,19 +59,31 @@ performance-*, -performance-enum-size, readability-container-size-empty, readability-delete-null-pointer, +<<<<<<< HEAD readability-duplicate-include, readability-misplaced-array-index, readability-redundant*, +======= +readability-duplicate-include +readability-misplaced-array-index, +readability-redundant* +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) readability-simplify-subscript-expr, readability-string-compare, -readability-redundant-access-specifiers, -readability-redundant-control-flow, +<<<<<<< HEAD -readability-redundant-inline-specifier, ' HeaderFilterRegex: '^(aten/|c10/|torch/).*$' WarningsAsErrors: '*' LineFilter: - name: '/usr/include/.*' +======= +' +HeaderFilterRegex: '^(aten/|c10/|torch/).*$' +WarningsAsErrors: '*' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CheckOptions: cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: true cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove: true diff --git a/.devcontainer/README.md b/.devcontainer/README.md index 7ef8da027ad9e..c7e65eeecedd1 100644 --- a/.devcontainer/README.md +++ b/.devcontainer/README.md @@ -61,8 +61,13 @@ You are now all set to start developing with PyTorch in a DevContainer environme ## Step 8: Build PyTorch To build pytorch from source, simply run: +<<<<<<< HEAD ```bash python -m pip install --no-build-isolation -v -e . +======= + ``` + python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` The process involves compiling thousands of files, and would take a long time. Fortunately, the compiled objects can be useful for your next build. When you modify some files, you only need to compile the changed files the next time. 
diff --git a/.editorconfig b/.editorconfig index e9581612a050e..0456b5cd51a07 100644 --- a/.editorconfig +++ b/.editorconfig @@ -1,11 +1,15 @@ root = true [*] +<<<<<<< HEAD charset = utf-8 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) end_of_line = lf insert_final_newline = true # Python +<<<<<<< HEAD [*.{py,pyi,py.in,pyi.in}] indent_style = space indent_size = 4 @@ -34,3 +38,12 @@ indent_style = tab indent_style = space indent_size = 2 end_of_line = crlf +======= +[*.py] +indent_style = space +indent_size = 4 + +# Make +[Makefile] +indent_style = tab +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.flake8 b/.flake8 index 937234edb4036..c3534a7e3db70 100644 --- a/.flake8 +++ b/.flake8 @@ -7,12 +7,24 @@ max-line-length = 120 # C408 ignored because we like the dict keyword argument syntax # E501 is not flexible enough, we're using B950 instead ignore = +<<<<<<< HEAD E203,E305,E402,E501,E704,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824, +======= + E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # shebang has extra meaning in fbcode lints, so I think it's not worth trying # to line this up with executable bit EXE001, # these ignores are from flake8-bugbear; please fix! +<<<<<<< HEAD B007,B008,B017,B019,B023,B028,B903,B905,B906,B907,B908,B910 +======= + B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907 + # these ignores are from flake8-comprehensions; please fix! + C407, + # these ignores are from flake8-logging-format; please fix! + G100,G101,G200 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # these ignores are from flake8-simplify. please fix or ignore with commented reason SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12, # SIM104 is already covered by pyupgrade ruff @@ -44,7 +56,10 @@ per-file-ignores = torch/__init__.py: F401,TOR901 torch/_custom_op/impl.py: TOR901 torch/_export/serde/upgrade.py: TOR901 +<<<<<<< HEAD torch/_functorch/predispatch.py: TOR901 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torch/_functorch/vmap.py: TOR901 torch/_inductor/test_operators.py: TOR901 torch/_library/abstract_impl.py: TOR901 @@ -69,7 +84,11 @@ exclude = ./docs/src, ./functorch/docs, ./functorch/examples, +<<<<<<< HEAD ./functorch/docs/source/tutorials, +======= + ./functorch/notebooks, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ./scripts, ./test/generated_type_hints_smoketest.py, ./third_party, diff --git a/.github/ISSUE_TEMPLATE/ci-sev.md b/.github/ISSUE_TEMPLATE/ci-sev.md index 1ed74161f55de..a149b5cff4fd9 100644 --- a/.github/ISSUE_TEMPLATE/ci-sev.md +++ b/.github/ISSUE_TEMPLATE/ci-sev.md @@ -1,6 +1,7 @@ --- name: "⚠️ CI SEV" about: Tracking incidents for PyTorch's CI infra. 
+<<<<<<< HEAD title: '' labels: '' assignees: '' @@ -9,6 +10,11 @@ assignees: '' > NOTE: Remember to label this issue with "`ci: sev`" > If you want autorevert to be disabled, keep the ci: disable-autorevert label +======= +--- + +> NOTE: Remember to label this issue with "`ci: sev`" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/ISSUE_TEMPLATE/disable-ci-jobs.md b/.github/ISSUE_TEMPLATE/disable-ci-jobs.md index d9e0cc22bd3f5..5379c662410d6 100644 --- a/.github/ISSUE_TEMPLATE/disable-ci-jobs.md +++ b/.github/ISSUE_TEMPLATE/disable-ci-jobs.md @@ -1,10 +1,15 @@ --- name: Disable CI jobs (PyTorch Dev Infra only) about: Use this template to disable CI jobs +<<<<<<< HEAD title: DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME] labels: 'module: ci' assignees: '' +======= +title: "DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]" +labels: "module: ci" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --- > For example, DISABLED pull / win-vs2022-cpu-py3 / test (default). Once diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index d021371ca8863..dccb8b3f6e4ef 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -12,19 +12,26 @@ self-hosted-runner: - linux.9xlarge.ephemeral - am2.linux.9xlarge.ephemeral - linux.12xlarge +<<<<<<< HEAD - linux.12xlarge.memory - linux.24xlarge - linux.24xlarge.memory +======= + - linux.24xlarge +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - linux.24xlarge.ephemeral - linux.24xlarge.amd - linux.arm64.2xlarge - linux.arm64.2xlarge.ephemeral - linux.arm64.m7g.4xlarge - linux.arm64.m7g.4xlarge.ephemeral +<<<<<<< HEAD - linux.arm64.r7g.12xlarge.memory - linux.aws.h100 - linux.aws.h100.4 - linux.aws.h100.8 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - linux.4xlarge.nvidia.gpu - linux.8xlarge.nvidia.gpu - linux.16xlarge.nvidia.gpu @@ -59,6 +66,7 @@ self-hosted-runner: - linux.rocm.gpu.mi250 - linux.rocm.gpu.2 - linux.rocm.gpu.4 +<<<<<<< HEAD # gfx942 runners - linux.rocm.gpu.gfx942.1 - linux.rocm.gpu.gfx942.2 @@ -66,6 +74,18 @@ self-hosted-runner: - rocm-docker # Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors) - macos-m1-stable +======= + # MI300 runners + - linux.rocm.gpu.mi300.2 + - linux.rocm.gpu.mi300.4 + - rocm-docker + # Repo-specific Apple hosted runners + - macos-m1-ultra + - macos-m2-14 + # Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors) + - macos-m1-stable + - macos-m1-13 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - macos-m1-14 # GitHub-hosted MacOS runners - macos-latest-xlarge diff --git a/.github/actions/build-android/action.yml b/.github/actions/build-android/action.yml new file mode 100644 index 0000000000000..bccd42aa42f2c --- /dev/null +++ b/.github/actions/build-android/action.yml @@ -0,0 +1,78 @@ +name: build android + +description: build android for a specific arch + +inputs: + arch: + description: arch to build + required: true + arch-for-build-env: + description: | + arch to pass to build environment. 
+ This is currently different than the arch name we use elsewhere, which + should be fixed. + required: true + github-secret: + description: github token + required: true + build-environment: + required: true + description: Top-level label for what's being built/tested. + docker-image: + required: true + description: Name of the base docker image to build with. + branch: + required: true + description: What branch we are building on. +outputs: + container_id: + description: Docker container identifier used to build the artifacts + value: ${{ steps.build.outputs.container_id }} + +runs: + using: composite + steps: + - name: Build-${{ inputs.arch }} + id: build + shell: bash + env: + BRANCH: ${{ inputs.branch }} + BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-${{ inputs.arch-for-build-env }}-build" + AWS_DEFAULT_REGION: us-east-1 + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + SCCACHE_REGION: us-east-1 + DOCKER_IMAGE: ${{ inputs.docker-image }} + MATRIX_ARCH: ${{ inputs.arch }} + run: | + # detached container should get cleaned up by teardown_ec2_linux + set -exo pipefail + export container_name + container_name=$(docker run \ + -e BUILD_ENVIRONMENT \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e AWS_DEFAULT_REGION \ + -e PR_NUMBER \ + -e SHA1 \ + -e BRANCH \ + -e SCCACHE_BUCKET \ + -e SCCACHE_REGION \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --detach \ + --user jenkins \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + git submodule sync && git submodule update -q --init --recursive --depth 1 + docker cp "${GITHUB_WORKSPACE}/." "${container_name}:/var/lib/jenkins/workspace" + (echo "sudo chown -R jenkins . 
&& .ci/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete" | docker exec -u jenkins -i "${container_name}" bash) 2>&1 + + # Copy install binaries back + mkdir -p "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}" + docker cp "${container_name}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}" + echo "container_id=${container_name}" >> "${GITHUB_OUTPUT}" diff --git a/.github/actions/checkout-pytorch/action.yml b/.github/actions/checkout-pytorch/action.yml index 15f193ef3a5dc..b64267fdf45c3 100644 --- a/.github/actions/checkout-pytorch/action.yml +++ b/.github/actions/checkout-pytorch/action.yml @@ -57,6 +57,7 @@ runs: submodules: ${{ inputs.submodules }} show-progress: false +<<<<<<< HEAD - name: Clean submodules post checkout id: clean-submodules if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} @@ -72,6 +73,8 @@ runs: git submodule foreach --recursive git clean -ffdx fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Clean workspace (try again) if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && (steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }} diff --git a/.github/actions/filter-test-configs/action.yml b/.github/actions/filter-test-configs/action.yml index 338fc0c2a844c..0fc3a4ac53048 100644 --- a/.github/actions/filter-test-configs/action.yml +++ b/.github/actions/filter-test-configs/action.yml @@ -70,7 +70,11 @@ runs: set -eux # PyYAML 6.0 doesn't work with MacOS x86 anymore # This must run on Python-3.7 (AmazonLinux2) so can't use request=3.32.2 +<<<<<<< HEAD python3 -m pip install requests==2.27.1 pyyaml==6.0.2 +======= + python3 -m pip install requests==2.27.1 pyyaml==6.0.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Parse ref id: parse-ref @@ -125,7 +129,11 @@ runs: TAG: ${{ steps.parse-ref.outputs.tag }} EVENT_NAME: ${{ github.event_name }} SCHEDULE: ${{ github.event.schedule }} +<<<<<<< HEAD HEAD_BRANCH: ${{ steps.parse-ref.outputs.branch }} +======= + HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id: filter run: | echo "Workflow: ${GITHUB_WORKFLOW}" diff --git a/.github/actions/linux-test/action.yml b/.github/actions/linux-test/action.yml index f29d776402ba2..b0cfb1ce4e213 100644 --- a/.github/actions/linux-test/action.yml +++ b/.github/actions/linux-test/action.yml @@ -126,7 +126,11 @@ runs: shell: bash continue-on-error: true run: | +<<<<<<< HEAD python3 -m pip install psutil==5.9.8 nvidia-ml-py==11.525.84 +======= + python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python3 -m tools.stats.monitor > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" @@ -274,6 +278,11 @@ runs: -w /var/lib/jenkins/workspace \ "${DOCKER_IMAGE}" ) +<<<<<<< HEAD +======= + # Propagate download.pytorch.org IP to container + grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes 
with float/bfloat16/half (#2791)) echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}" docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}" diff --git a/.github/actions/reuse-old-whl/reuse_old_whl.py b/.github/actions/reuse-old-whl/reuse_old_whl.py index 48a8490985946..58fc16d3565be 100644 --- a/.github/actions/reuse-old-whl/reuse_old_whl.py +++ b/.github/actions/reuse-old-whl/reuse_old_whl.py @@ -264,7 +264,11 @@ def change_content_to_new_version(file: Union[str, Path]) -> None: change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py") for file in Path(f"artifacts/dist/{old_stem}").glob( +<<<<<<< HEAD "*.dist-info/*", +======= + "*.dist-info/**", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ): change_content_to_new_version(file) @@ -304,7 +308,12 @@ def change_content_to_new_version(file: Union[str, Path]) -> None: def set_output() -> None: +<<<<<<< HEAD print("Setting output reuse=true") +======= + # Disable for now so we can monitor first + # pass +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if os.getenv("GITHUB_OUTPUT"): with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env: print("reuse=true", file=env) diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml index 4370549e4801a..2e8af1c65b4c6 100644 --- a/.github/actions/setup-linux/action.yml +++ b/.github/actions/setup-linux/action.yml @@ -28,10 +28,13 @@ runs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" +<<<<<<< HEAD - name: Print GPU info (if present) shell: bash run: if [ -f /usr/bin/nvidia-smi ]; then nvidia-smi; fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Check if in a container runner shell: bash id: check_container_runner @@ -86,6 +89,40 @@ runs: # Prune all of the docker images docker system prune -af +<<<<<<< HEAD +======= + - name: Manually resolve download.pytorch.org + shell: bash + continue-on-error: true + run: | + set +e + set -x + + PT_DOMAIN=download.pytorch.org + # TODO: Flaky access to download.pytorch.org https://github.com/pytorch/pytorch/issues/100400, + # cleaning this up once the issue is fixed. There are more than one resolved IP here, the last + # one is returned at random + RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" | tail -n1) + + if [ -z "${RESOLVED_IP}" ]; then + echo "Couldn't resolve ${PT_DOMAIN}, retrying with Google DNS..." + RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" @8.8.8.8 | tail -n1) + + if [ -z "${RESOLVED_IP}" ]; then + echo "Couldn't resolve ${PT_DOMAIN}, exiting..." 
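Editor's note on the two download.pytorch.org workarounds nearby: the setup-linux hunk (whose Google-DNS fallback and /etc/hosts pinning finish just below) resolves the domain manually, and the linux-test hunk earlier copies that pinned entry into the build container. A hedged way to confirm both views agree — the container name here is only an example:

```bash
# Illustrative check only; "builder" stands in for whatever container name the
# workflow actually generated.
getent hosts download.pytorch.org                         # host view: should show the pinned IP
docker exec builder getent hosts download.pytorch.org     # container view after the /etc/hosts copy
```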
+ exit 1 + fi + fi + + if grep -r "${PT_DOMAIN}" /etc/hosts; then + # Clean up any old records first + sudo sed -i "/${PT_DOMAIN}/d" /etc/hosts + fi + + echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts + cat /etc/hosts + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Check that the docker daemon is running shell: bash continue-on-error: true diff --git a/.github/actions/setup-rocm/action.yml b/.github/actions/setup-rocm/action.yml index f77c6267f5067..e60a9c83f267a 100644 --- a/.github/actions/setup-rocm/action.yml +++ b/.github/actions/setup-rocm/action.yml @@ -59,6 +59,14 @@ runs: echo "$msg" exit 1 fi +<<<<<<< HEAD +======= + if [[ $ngpu -eq 1 ]]; then + echo "Error: only 1 GPU detected, at least 2 GPUs are needed for distributed jobs" + echo "$msg" + exit 1 + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Runner diskspace health check uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main @@ -111,6 +119,7 @@ runs: # This video group ID maps to subgid 1 inside the docker image due to the /etc/subgid entries. # The group name corresponding to group ID 1 can change depending on the OS, so both are necessary. echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd $DEVICE_FLAG --group-add video --group-add $render_gid --group-add daemon --group-add bin --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --network=host" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: configure aws credentials id: aws_creds @@ -131,3 +140,5 @@ runs: env | grep '^GITHUB' >> "${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" env | grep '^CI' >> "${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" env | grep '^RUNNER' >> "${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml index 2ea330f93b490..90850b5551edb 100644 --- a/.github/actions/setup-win/action.yml +++ b/.github/actions/setup-win/action.yml @@ -6,12 +6,15 @@ inputs: cuda-version: description: which cuda version to install, 'cpu' for none required: true +<<<<<<< HEAD python-version: required: false type: string default: "3.10" description: | The python version to be used. 
Will be 3.10 by default +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runs: using: composite @@ -44,24 +47,34 @@ runs: CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat" { +<<<<<<< HEAD echo "CONDA=${CONDA}"; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "CONDA_RUN=${CONDA} run --no-capture-output"; echo "CONDA_BUILD=${CONDA} run conda-build"; echo "CONDA_INSTALL=${CONDA} install"; } >> "${GITHUB_ENV}" - name: Setup Python3 +<<<<<<< HEAD env: PYTHON_VERSION: ${{ inputs.python-version }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shell: bash run: | set +e set -x +<<<<<<< HEAD # Create new py_tmp env with python-version ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp libuv PYTHON3=$(${CONDA_RUN} -n py_tmp which python3) +======= + PYTHON3=$(${CONDA_RUN} which python3) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) EXIT_CODE=$? if [[ "${EXIT_CODE}" == "0" ]]; then @@ -74,7 +87,11 @@ runs: # installation, which is Python 3 based. Its Python is default to Python 3. Further, there # is also the Miniconda installation that is Python 2 based, and both can be installed if # needed. In both cases, Python binary is just called python +<<<<<<< HEAD PYTHON=$(${CONDA_RUN} -n py_tmp which python) +======= + PYTHON=$(${CONDA_RUN} which python) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) EXIT_CODE=$? if [[ "${EXIT_CODE}" == "0" ]]; then diff --git a/.github/actions/teardown-win/action.yml b/.github/actions/teardown-win/action.yml index b5e5f74db037a..8adc0c90a958b 100644 --- a/.github/actions/teardown-win/action.yml +++ b/.github/actions/teardown-win/action.yml @@ -23,6 +23,12 @@ runs: run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + - name: Clean up leftover processes on non-ephemeral Windows runner + uses: pytorch/test-infra/.github/actions/cleanup-runner@main + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cleaning up Windows workspace sometimes fails flakily with device or resource busy # error, meaning one or more processes haven't stopped completely yet. 
So trying to # retry this step several time similar to how checkout-pytorch GHA does diff --git a/.github/actions/test-pytorch-binary/action.yml b/.github/actions/test-pytorch-binary/action.yml index 991cf9fb87eff..6b682f2a768e3 100644 --- a/.github/actions/test-pytorch-binary/action.yml +++ b/.github/actions/test-pytorch-binary/action.yml @@ -24,6 +24,10 @@ runs: -e PYTORCH_FINAL_PACKAGE_DIR \ -e PYTORCH_ROOT \ -e SKIP_ALL_TESTS \ +<<<<<<< HEAD +======= + -e USE_SPLIT_BUILD \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --tty \ --detach \ -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ @@ -33,6 +37,13 @@ runs: ) echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV" +<<<<<<< HEAD +======= + if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" && "${GPU_ARCH_TYPE}" != "xpu" ]]; then + # Propagate download.pytorch.org IP to container. This is only needed on Linux non aarch64 runner + grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" bash -c "/bin/cat >> /etc/hosts" + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" # Generate test script diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index 8af554d56ee57..9c375a2708937 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 69bbe7363897764f9e758d851cd0340147d27f94 +======= +4e94321c54617dd738a05bfedfc28bc0fa635b5c +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/ci_commit_pins/fbgemm_rocm.txt b/.github/ci_commit_pins/fbgemm_rocm.txt index 19f5a2b2efa1a..aec0e9af23271 100644 --- a/.github/ci_commit_pins/fbgemm_rocm.txt +++ b/.github/ci_commit_pins/fbgemm_rocm.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 08ae0af1395c8d8471f4025deb6af9aef90b342f +======= +5fb5024118e9bb9decf96c2b0b1a8f0010bf56be +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/ci_commit_pins/torchbench.txt b/.github/ci_commit_pins/torchbench.txt new file mode 100644 index 0000000000000..efbc3ceeb2afe --- /dev/null +++ b/.github/ci_commit_pins/torchbench.txt @@ -0,0 +1 @@ +e03a63be43e33596f7f0a43b0f530353785e4a59 diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt index 5d9b8d5d171ef..794fb1c780dd9 100644 --- a/.github/ci_commit_pins/vision.txt +++ b/.github/ci_commit_pins/vision.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 218d2ab791d437309f91e0486eb9fa7f00badc17 +======= +966da7e46f65d6d49df3e31214470a4fe5cc8e66 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 280d5ab77009f..df14012a202e5 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1,5 @@ +<<<<<<< HEAD df6798dfb931ce7c7fe5bed2447cd1092a5981af +======= +r2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/label_to_label.yml 
b/.github/label_to_label.yml index 782696fc782d3..89cf6640a6c2f 100644 --- a/.github/label_to_label.yml +++ b/.github/label_to_label.yml @@ -16,11 +16,14 @@ then: - "module: pt2-dispatcher" - any: +<<<<<<< HEAD - "vllm-compile" then: - "module: vllm" - "oncall: pt2" - any: +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - "module: vmap" then: - "module: functorch" @@ -33,6 +36,13 @@ then: - "module: dynamo" - any: +<<<<<<< HEAD +======= + - "module: flex attention" + then: + - "module: higher order operators" +- any: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - "module: aotinductor" then: - "oncall: export" @@ -49,6 +59,7 @@ - "module: dynamic shapes" then: - "oncall: pt2" +<<<<<<< HEAD - any: - "release notes: distributed (c10d)" - "release notes: distributed (symm_mem)" @@ -58,3 +69,5 @@ - "oncall: distributed" then: - "ciflow/h100-distributed" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/labeler.yml b/.github/labeler.yml index 7b47b9fefb5dc..347fff90afbde 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -130,6 +130,7 @@ - torch/csrc/inductor/aoti_include/** - torchgen/aoti/** - torchgen/gen_aoti_c_shim.py +<<<<<<< HEAD "ciflow/vllm": - .github/ci_commit_pins/vllm.txt @@ -162,3 +163,5 @@ - torch/_inductor/kernel/mm.py - test/inductor/test_max_autotune.py - third_party/fbgemm +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml index 4ad15ecc7f8ba..576691f1e51be 100644 --- a/.github/merge_rules.yaml +++ b/.github/merge_rules.yaml @@ -76,7 +76,10 @@ - .github/ci_commit_pins/audio.txt - .github/ci_commit_pins/vision.txt - .github/ci_commit_pins/torchdynamo.txt +<<<<<<< HEAD - .github/ci_commit_pins/vllm.txt +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - .ci/docker/ci_commit_pins/triton.txt approved_by: - pytorchbot @@ -131,6 +134,24 @@ - Lint - pull +<<<<<<< HEAD +======= +- name: Mobile + patterns: + - ios/** + - android/** + - test/mobile/** + approved_by: + - linbinyu + - IvanKobzarev + - dreiss + - raziel + mandatory_checks_name: + - EasyCLA + - Lint + - pull + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: PrimTorch patterns: - torch/_meta_registrations.py @@ -370,7 +391,10 @@ - leslie-fang-intel - jgong5 - EikanWang +<<<<<<< HEAD - CaoE +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mandatory_checks_name: - EasyCLA - Lint @@ -422,7 +446,10 @@ approved_by: - leslie-fang-intel - jgong5 +<<<<<<< HEAD - CaoE +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mandatory_checks_name: - EasyCLA - Lint @@ -477,6 +504,7 @@ - srossross - chillee - zou3519 +<<<<<<< HEAD - guilhermeleobas mandatory_checks_name: - EasyCLA @@ -494,6 +522,8 @@ - test/inductor_skips/** approved_by: - guilhermeleobas +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) mandatory_checks_name: - EasyCLA - Lint @@ -525,6 +555,7 @@ - Lint - pull +<<<<<<< HEAD - name: typechecking patterns: - 'pyrefly.toml' @@ -560,6 +591,8 @@ - Lint - pull +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: superuser patterns: - '*' diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index 74b0d243859a2..a0500b3b89da5 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1,6 +1,7 @@ tracking_issue: 24422 ciflow_tracking_issue: 64124 ciflow_push_tags: +<<<<<<< HEAD - ciflow/b200 - ciflow/b200-symm-mem - ciflow/b200-distributed @@ -43,6 +44,37 @@ ciflow_push_tags: - ciflow/vllm - ciflow/win-arm64 - ciflow/xpu +======= +- ciflow/binaries +- ciflow/binaries_libtorch +- ciflow/binaries_wheel +- ciflow/inductor +- ciflow/inductor-periodic +- ciflow/inductor-rocm +- ciflow/inductor-perf-test-nightly-rocm +- ciflow/inductor-perf-compare +- ciflow/inductor-micro-benchmark +- ciflow/inductor-micro-benchmark-cpu-x86 +- ciflow/inductor-perf-test-nightly-x86-zen +- ciflow/inductor-cu126 +- ciflow/linux-aarch64 +- ciflow/mps +- ciflow/nightly +- ciflow/periodic +- ciflow/periodic-rocm-mi300 +- ciflow/rocm +- ciflow/rocm-mi300 +- ciflow/s390 +- ciflow/slow +- ciflow/trunk +- ciflow/unstable +- ciflow/xpu +- ciflow/torchbench +- ciflow/op-benchmark +- ciflow/pull +- ciflow/h100 +- ciflow/h100-distributed +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retryable_workflows: - pull - trunk @@ -51,4 +83,8 @@ retryable_workflows: - inductor-A100-perf-nightly labeler_config: labeler.yml label_to_label_config: label_to_label.yml +<<<<<<< HEAD mergebot: true +======= +mergebot: True +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/requirements-gha-cache.txt b/.github/requirements-gha-cache.txt index c274ca1e5914d..e4085fcc6adbb 100644 --- a/.github/requirements-gha-cache.txt +++ b/.github/requirements-gha-cache.txt @@ -1,15 +1,28 @@ # This file is to cache other dependencies not specified elsewhere in: +<<<<<<< HEAD # requirements.txt # requirements-build.txt +======= +# requirement.txt +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # docs/requirements.txt # docs/cpp/requirements.txt # functorch/docs/requirements.txt # .ci/docker/requirements-ci.txt boto3==1.35.42 jinja2==3.1.6 +<<<<<<< HEAD lintrunner==0.12.7 ninja==1.10.0.post1 nvidia-ml-py==11.525.84 pyyaml==6.0.2 requests==2.32.4 rich==14.1.0 +======= +lintrunner==0.10.7 +ninja==1.10.0.post1 +nvidia-ml-py==11.525.84 +pyyaml==6.0 +requests==2.32.4 +rich==10.9.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/requirements/conda-env-macOS-ARM64 b/.github/requirements/conda-env-macOS-ARM64 new file mode 100644 index 0000000000000..b6e9a6ce9f3e5 --- /dev/null +++ b/.github/requirements/conda-env-macOS-ARM64 @@ -0,0 +1,5 @@ +# Not pinning certifi so that we can always get the latest certificates +certifi +pip=23.2.1 +pkg-config=0.29.2 +wheel=0.37.1 diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt new file mode 100644 index 0000000000000..e8464f0a55ff5 --- /dev/null +++ 
b/.github/requirements/pip-requirements-macOS.txt @@ -0,0 +1,36 @@ +boto3==1.35.42 +cmake==3.27.* +expecttest==0.3.0 +fbscribelogger==0.1.7 +filelock==3.6.0 +hypothesis==6.56.4 +librosa>=0.6.2 +mpmath==1.3.0 +networkx==2.8.7 +ninja==1.10.2.4 +numba==0.59.0 +numpy==1.26.4 +opt-einsum>=3.3 +optree==0.13.0 +packaging==23.1 +parameterized==0.8.1 +pillow==10.3.0 +protobuf==5.29.4 +psutil==5.9.1 +pygments==2.15.0 +pytest-cpp==2.3.0 +pytest-flakefinder==1.1.0 +pytest-rerunfailures==10.3 +pytest-subtests==0.13.1 +pytest-xdist==3.3.1 +pytest==7.3.2 +pyyaml==6.0.2 +scipy==1.12.0 +setuptools==72.1.0 +sympy==1.13.3 +tlparse==0.3.30 +tensorboard==2.13.0 +typing-extensions==4.12.2 +unittest-xml-reporting<=3.2.0,>=2.0.0 +xdoctest==1.1.0 +z3-solver==4.12.2.0 diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index e541e7a86f653..040fa12f368a6 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -57,7 +57,10 @@ def get_rocm_version() -> str: rocm_version_h = f"{rocm_path}/include/rocm-core/rocm_version.h" if not os.path.isfile(rocm_version_h): rocm_version_h = f"{rocm_path}/include/rocm_version.h" +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # The file could be missing due to 1) ROCm version < 5.2, or 2) no ROCm install. if os.path.isfile(rocm_version_h): RE_MAJOR = re.compile(r"#define\s+ROCM_VERSION_MAJOR\s+(\d+)") @@ -90,23 +93,38 @@ def build_triton( if "MAX_JOBS" not in env: max_jobs = os.cpu_count() or 1 env["MAX_JOBS"] = str(max_jobs) +<<<<<<< HEAD version_suffix = "" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if not release: # Nightly binaries include the triton commit hash, i.e. 2.1.0+e6216047b8 # while release build should only include the version, i.e. 
2.1.0 rocm_version = get_rocm_version() version_suffix = f"+rocm{rocm_version}.git{commit_hash[:8]}" version += version_suffix +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with TemporaryDirectory() as tmpdir: triton_basedir = Path(tmpdir) / "triton" triton_pythondir = triton_basedir / "python" triton_repo = "https://github.com/openai/triton" if device == "rocm": +<<<<<<< HEAD triton_pkg_name = "triton" triton_repo = "https://github.com/ROCm/triton" +======= + triton_repo = "https://github.com/ROCm/triton" + rocm_version = get_rocm_version() # e.g., "7.0.1" + if tuple(map(int, rocm_version.split("."))) > (7, 0, 0): + triton_pkg_name = "triton" + else: + triton_pkg_name = "pytorch-triton-rocm" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif device == "xpu": triton_pkg_name = "pytorch-triton-xpu" triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton" @@ -119,7 +137,10 @@ def build_triton( ["git", "checkout", f"release/{ver}.{rev}.x"], cwd=triton_basedir ) else: +<<<<<<< HEAD check_call(["git", "fetch", "origin", commit_hash], cwd=triton_basedir) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) check_call(["git", "checkout", commit_hash], cwd=triton_basedir) # change built wheel name and version @@ -163,6 +184,7 @@ def build_triton( cwd=triton_basedir, ) +<<<<<<< HEAD # For gpt-oss models, triton requires this extra triton_kernels wheel # triton_kernels came after pytorch release/2.8 triton_kernels_dir = Path(f"{triton_basedir}/python/triton_kernels") @@ -170,6 +192,8 @@ def build_triton( kernels_whl_path = next(iter((triton_kernels_dir / "dist").glob("*.whl"))) shutil.copy(kernels_whl_path, Path.cwd()) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Path.cwd() / whl_path.name diff --git a/.github/scripts/delete_old_branches.py b/.github/scripts/delete_old_branches.py index 8032008edf122..63a82ca1b3dd5 100644 --- a/.github/scripts/delete_old_branches.py +++ b/.github/scripts/delete_old_branches.py @@ -275,7 +275,11 @@ def delete_branches() -> None: delete_branch(git_repo, branch) +<<<<<<< HEAD def delete_old_tags() -> None: +======= +def delete_old_ciflow_tags() -> None: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Deletes ciflow tags if they are associated with a closed PR or a specific # commit. Lightweight tags don't have information about the date they were # created, so we can't check how old they are. 
The script just assumes that @@ -288,14 +292,20 @@ def delete_tag(tag: str) -> None: delete_branch(git_repo, f"refs/tags/{tag}") tags = git_repo._run_git("tag").splitlines() +<<<<<<< HEAD CIFLOW_TAG_REGEX = re.compile(r"^ciflow\/.*\/(\d{5,6}|[0-9a-f]{40})$") AUTO_REVERT_TAG_REGEX = re.compile(r"^trunk\/[0-9a-f]{40}$") +======= + open_pr_numbers = [x["number"] for x in get_open_prs()] + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for tag in tags: try: if ESTIMATED_TOKENS[0] > 400: print("Estimated tokens exceeded, exiting") break +<<<<<<< HEAD if not CIFLOW_TAG_REGEX.match(tag) and not AUTO_REVERT_TAG_REGEX.match(tag): continue @@ -311,6 +321,18 @@ def delete_tag(tag: str) -> None: if tag_age_days > 7: print(f"[{tag}] Tag is older than 7 days, deleting") +======= + if not tag.startswith("ciflow/"): + continue + re_match_pr = re.match(r"^ciflow\/.*\/(\d{5,6})$", tag) + re_match_sha = re.match(r"^ciflow\/.*\/([0-9a-f]{40})$", tag) + if re_match_pr: + pr_number = int(re_match_pr.group(1)) + if pr_number in open_pr_numbers: + continue + delete_tag(tag) + elif re_match_sha: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) delete_tag(tag) except Exception as e: print(f"Failed to check tag {tag}: {e}") @@ -318,4 +340,8 @@ def delete_tag(tag: str) -> None: if __name__ == "__main__": delete_branches() +<<<<<<< HEAD delete_old_tags() +======= + delete_old_ciflow_tags() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/scripts/docathon-label-sync.py b/.github/scripts/docathon-label-sync.py index 04f4707a55c3f..bc81f319bf997 100644 --- a/.github/scripts/docathon-label-sync.py +++ b/.github/scripts/docathon-label-sync.py @@ -39,9 +39,13 @@ def main() -> None: pull_request_label_names = [label.name for label in pull_request_labels] issue_label_names = [label.name for label in issue_labels] labels_to_add = [ +<<<<<<< HEAD label for label in issue_label_names if label not in pull_request_label_names and label != "actionable" +======= + label for label in issue_label_names if label not in pull_request_label_names +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] if not labels_to_add: print("The pull request already has the same labels.") diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py index 592c7aab6d933..c8419a7631887 100755 --- a/.github/scripts/filter_test_configs.py +++ b/.github/scripts/filter_test_configs.py @@ -18,7 +18,10 @@ REENABLE_TEST_REGEX = "(?i)(Close(d|s)?|Resolve(d|s)?|Fix(ed|es)?) 
(#|https://github.com/pytorch/pytorch/issues/)([0-9]+)" +<<<<<<< HEAD MAIN_BRANCH = "main" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) PREFIX = "test-config/" @@ -41,9 +44,15 @@ def is_cuda_or_rocm_job(job_name: Optional[str]) -> bool: } # The link to the published list of disabled jobs +<<<<<<< HEAD DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json" # and unstable jobs UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json" +======= +DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=HnkH0xQWnnsoeMsSIVf9291NE5c4jWSa" +# and unstable jobs +UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=iP_F8gBs60PfOMAJ8gnn1paVrzM1WYsK" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Some constants used to handle disabled and unstable jobs JOB_NAME_SEP = "/" @@ -98,7 +107,11 @@ def parse_args() -> Any: parser.add_argument( "--branch", type=str, +<<<<<<< HEAD default=MAIN_BRANCH, +======= + default="main", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) help="the branch name", ) return parser.parse_args() @@ -457,7 +470,10 @@ def download_json(url: str, headers: dict[str, str], num_retries: int = 3) -> An def set_output(name: str, val: Any) -> None: +<<<<<<< HEAD print(f"Setting output {name}={val}") +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if os.getenv("GITHUB_OUTPUT"): with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env: print(f"{name}={val}", file=env) @@ -497,17 +513,22 @@ def check_for_setting(labels: set[str], body: str, setting: str) -> bool: def perform_misc_tasks( +<<<<<<< HEAD labels: set[str], test_matrix: dict[str, list[Any]], job_name: str, pr_body: str, branch: Optional[str] = None, tag: Optional[str] = None, +======= + labels: set[str], test_matrix: dict[str, list[Any]], job_name: str, pr_body: str +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) -> None: """ In addition to apply the filter logic, the script also does the following misc tasks to set keep-going and is-unstable variables """ +<<<<<<< HEAD set_output( "keep-going", branch == MAIN_BRANCH @@ -516,6 +537,9 @@ def perform_misc_tasks( or bool(tag and re.match(r"^ciflow/[^/]+/[a-f0-9]{40}$", tag)) or check_for_setting(labels, pr_body, "keep-going"), ) +======= + set_output("keep-going", check_for_setting(labels, pr_body, "keep-going")) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set_output( "ci-verbose-test-logs", check_for_setting(labels, pr_body, "ci-verbose-test-logs"), @@ -638,8 +662,11 @@ def main() -> None: test_matrix=filtered_test_matrix, job_name=args.job_name, pr_body=pr_body if pr_body else "", +<<<<<<< HEAD branch=args.branch, tag=tag, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) # Set the filtered test matrix as the output diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index fd04922f39999..c42d3e41a0a53 100644 --- 
a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -16,23 +16,37 @@ # NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this +<<<<<<< HEAD CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"] +======= +CUDA_ARCHES = ["12.6", "12.8", "12.9"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDA_STABLE = "12.8" CUDA_ARCHES_FULL_VERSION = { "12.6": "12.6.3", "12.8": "12.8.1", "12.9": "12.9.1", +<<<<<<< HEAD "13.0": "13.0.2", +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } CUDA_ARCHES_CUDNN_VERSION = { "12.6": "9", "12.8": "9", "12.9": "9", +<<<<<<< HEAD "13.0": "9", } # NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this ROCM_ARCHES = ["6.4", "7.0"] +======= +} + +# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this +ROCM_ARCHES = ["6.3", "6.4"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) XPU_ARCHES = ["xpu"] @@ -40,11 +54,16 @@ CPU_S390X_ARCH = ["cpu-s390x"] +<<<<<<< HEAD CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "12.9-aarch64", "13.0-aarch64"] +======= +CUDA_AARCH64_ARCHES = ["12.9-aarch64"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { "12.6": ( +<<<<<<< HEAD "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | " "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | " "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | " @@ -133,6 +152,76 @@ "tcmlib==1.4.0 | " "umf==0.11.0 | " "intel-pti==0.13.1" +======= + "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'" + ), + "12.8": ( + "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-cupti-cu12==12.8.90; 
platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'" + ), + "12.9": ( + "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'" + ), + "xpu": ( + "intel-cmplr-lib-rt==2025.1.1 | " + "intel-cmplr-lib-ur==2025.1.1 | " + "intel-cmplr-lic-rt==2025.1.1 | " + "intel-sycl-rt==2025.1.1 | " + "oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "onemkl-sycl-blas==2025.1.0 | " + "onemkl-sycl-dft==2025.1.0 | " + "onemkl-sycl-lapack==2025.1.0 | " + "onemkl-sycl-rng==2025.1.0 | " + "onemkl-sycl-sparse==2025.1.0 | " + "dpcpp-cpp-rt==2025.1.1 | " + "intel-opencl-rt==2025.1.1 | " + "mkl==2025.1.0 | " + "intel-openmp==2025.1.1 | " + "tbb==2022.1.0 | " + "tcmlib==1.3.0 | " + "umf==0.10.0 | " + "intel-pti==0.12.3" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ), } @@ -143,7 +232,13 @@ def get_nccl_wheel_version(arch_version: str) -> str: requirements = map( str.strip, 
re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version]) ) +<<<<<<< HEAD return next(x for x in requirements if x.startswith("nvidia-nccl")).split("==")[1] +======= + return next(x for x in requirements if x.startswith("nvidia-nccl-cu")).split("==")[ + 1 + ] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def read_nccl_pin(arch_version: str) -> str: @@ -210,7 +305,11 @@ def arch_type(arch_version: str) -> str: "cpu": "libtorch-cxx11-builder:cpu", } +<<<<<<< HEAD FULL_PYTHON_VERSIONS = ["3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"] +======= +FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str: @@ -241,11 +340,15 @@ def generate_libtorch_matrix( arches += CUDA_ARCHES arches += ROCM_ARCHES elif os == "windows": +<<<<<<< HEAD # TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up # in 2.10 windows_cuda_arches = CUDA_ARCHES.copy() windows_cuda_arches.remove("12.9") arches += windows_cuda_arches +======= + arches += CUDA_ARCHES +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if libtorch_variants is None: libtorch_variants = [ "shared-with-deps", @@ -294,6 +397,10 @@ def generate_wheels_matrix( os: str, arches: Optional[list[str]] = None, python_versions: Optional[list[str]] = None, +<<<<<<< HEAD +======= + use_split_build: bool = False, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) -> list[dict[str, str]]: package_type = "wheel" if os == "linux" or os == "linux-aarch64" or os == "linux-s390x": @@ -309,11 +416,15 @@ def generate_wheels_matrix( if os == "linux": arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES elif os == "windows": +<<<<<<< HEAD # TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up # in 2.10 windows_cuda_arches = CUDA_ARCHES.copy() windows_cuda_arches.remove("12.9") arches += windows_cuda_arches + XPU_ARCHES +======= + arches += CUDA_ARCHES + XPU_ARCHES +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif os == "linux-aarch64": # Separate new if as the CPU type is different and # uses different build/test scripts @@ -336,6 +447,7 @@ def generate_wheels_matrix( else arch_version ) +<<<<<<< HEAD # TODO: Enable python 3.14 for rest if os not in [ "linux", @@ -350,6 +462,25 @@ def generate_wheels_matrix( if ( arch_version in ["13.0", "12.9", "12.8", "12.6"] +======= + # TODO: Enable python 3.13t on cpu-s390x + if gpu_arch_type == "cpu-s390x" and python_version == "3.13t": + continue + + if use_split_build and ( + arch_version not in ["12.6", "12.8", "12.9", "cpu"] or os != "linux" + ): + raise RuntimeError( + "Split build is only supported on linux with cuda 12* and cpu.\n" + f"Currently attempting to build on arch version {arch_version} and os {os}.\n" + "Please modify the matrix generation to exclude this combination." 
+ ) + + # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install + + if ( + arch_version in ["12.9", "12.8", "12.6"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) and os == "linux" or arch_version in CUDA_AARCH64_ARCHES ): @@ -360,6 +491,10 @@ def generate_wheels_matrix( "gpu_arch_type": gpu_arch_type, "gpu_arch_version": gpu_arch_version, "desired_cuda": desired_cuda, +<<<<<<< HEAD +======= + "use_split_build": "True" if use_split_build else "False", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "container_image": WHEEL_CONTAINER_IMAGES[arch_version].split( ":" )[0], @@ -382,6 +517,33 @@ def generate_wheels_matrix( ), # include special case for aarch64 build, remove the -aarch64 postfix } ) +<<<<<<< HEAD +======= + # Special build building to use on Colab. Python 3.11 for 12.6 CUDA + if python_version == "3.11" and arch_version == CUDA_STABLE: + ret.append( + { + "python_version": python_version, + "gpu_arch_type": gpu_arch_type, + "gpu_arch_version": gpu_arch_version, + "desired_cuda": translate_desired_cuda( + gpu_arch_type, gpu_arch_version + ), + "use_split_build": "True" if use_split_build else "False", + "container_image": WHEEL_CONTAINER_IMAGES[ + arch_version + ].split(":")[0], + "container_image_tag_prefix": WHEEL_CONTAINER_IMAGES[ + arch_version + ].split(":")[1], + "package_type": package_type, + "pytorch_extra_install_requirements": "", + "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950 + ".", "_" + ), + } + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else: ret.append( { @@ -391,6 +553,10 @@ def generate_wheels_matrix( "desired_cuda": translate_desired_cuda( gpu_arch_type, gpu_arch_version ), +<<<<<<< HEAD +======= + "use_split_build": "True" if use_split_build else "False", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "container_image": WHEEL_CONTAINER_IMAGES[arch_version].split( ":" )[0], @@ -412,7 +578,10 @@ def generate_wheels_matrix( return ret +<<<<<<< HEAD validate_nccl_dep_consistency("13.0") +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) validate_nccl_dep_consistency("12.9") validate_nccl_dep_consistency("12.8") validate_nccl_dep_consistency("12.6") diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 7d22e5059b7cb..f121486890d22 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -22,7 +22,10 @@ LABEL_CIFLOW_PERIODIC = "ciflow/periodic" LABEL_CIFLOW_BINARIES_LIBTORCH = "ciflow/binaries_libtorch" LABEL_CIFLOW_BINARIES_WHEEL = "ciflow/binaries_wheel" +<<<<<<< HEAD LABEL_CIFLOW_ROCM = "ciflow/rocm" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) @dataclass @@ -59,7 +62,13 @@ class BinaryBuildWorkflow: is_scheduled: str = "" branches: str = "nightly" # Mainly for macos +<<<<<<< HEAD macos_runner: str = "macos-14-xlarge" +======= + cross_compile_arm64: bool = False + macos_runner: str = "macos-14-xlarge" + use_split_build: bool = False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Mainly used for libtorch builds build_variant: str = "" @@ -70,6 +79,12 @@ def __post_init__(self) -> None: for item in [self.os, "binary", self.package_type, self.build_variant] if item != "" ) +<<<<<<< HEAD +======= + if self.use_split_build: + # added to distinguish concurrency groups + self.build_environment += "-split" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: output_file_path = ( @@ -112,6 +127,24 @@ class OperatingSystem: isolated_workflow=True, ), ), +<<<<<<< HEAD +======= + # See https://github.com/pytorch/pytorch/issues/138750 + # BinaryBuildWorkflow( + # os=OperatingSystem.LINUX, + # package_type="manywheel", + # build_configs=generate_binary_build_matrix.generate_wheels_matrix( + # OperatingSystem.LINUX, + # use_split_build=True, + # arches=["11.8", "12.1", "12.4", "cpu"], + # ), + # ciflow_config=CIFlowConfig( + # labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, + # isolated_workflow=True, + # ), + # use_split_build=True, + # ), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) BinaryBuildWorkflow( os=OperatingSystem.LINUX, package_type="libtorch", @@ -127,6 +160,50 @@ class OperatingSystem: ), ] +<<<<<<< HEAD +======= +LINUX_BINARY_SMOKE_WORKFLOWS = [ + BinaryBuildWorkflow( + os=OperatingSystem.LINUX, + package_type="manywheel", + build_configs=generate_binary_build_matrix.generate_wheels_matrix( + OperatingSystem.LINUX, + arches=["12.6", "12.8", "12.9", "6.4"], + python_versions=["3.9"], + ), + branches="main", + ), + # See https://github.com/pytorch/pytorch/issues/138750 + # BinaryBuildWorkflow( + # os=OperatingSystem.LINUX, + # package_type="manywheel", + # build_configs=generate_binary_build_matrix.generate_wheels_matrix( + # OperatingSystem.LINUX, + # arches=["11.8", "12.1", "12.4"], + # python_versions=["3.9"], + # use_split_build=True, + # ), + # ciflow_config=CIFlowConfig( + # labels={LABEL_CIFLOW_PERIODIC}, + # ), + # branches="main", + # use_split_build=True, + # ), + BinaryBuildWorkflow( + os=OperatingSystem.LINUX, + package_type="libtorch", + build_variant=generate_binary_build_matrix.RELEASE, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.LINUX, + generate_binary_build_matrix.RELEASE, + arches=["cpu"], + libtorch_variants=["shared-with-deps"], + ), + branches="main", + ), +] + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) WINDOWS_BINARY_BUILD_WORKFLOWS = [ BinaryBuildWorkflow( os=OperatingSystem.WINDOWS, @@ -212,6 +289,42 @@ class OperatingSystem: ), ] +<<<<<<< HEAD +======= +WINDOWS_BINARY_SMOKE_WORKFLOWS = [ + BinaryBuildWorkflow( + os=OperatingSystem.WINDOWS, + package_type="libtorch", + build_variant=generate_binary_build_matrix.RELEASE, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.WINDOWS, + generate_binary_build_matrix.RELEASE, + arches=["cpu"], + libtorch_variants=["shared-with-deps"], + ), + branches="main", + ciflow_config=CIFlowConfig( + isolated_workflow=True, + ), + ), + BinaryBuildWorkflow( + os=OperatingSystem.WINDOWS, + package_type="libtorch", + build_variant=generate_binary_build_matrix.DEBUG, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + 
OperatingSystem.WINDOWS, + generate_binary_build_matrix.DEBUG, + arches=["cpu"], + libtorch_variants=["shared-with-deps"], + ), + branches="main", + ciflow_config=CIFlowConfig( + isolated_workflow=True, + ), + ), +] + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) MACOS_BINARY_BUILD_WORKFLOWS = [ BinaryBuildWorkflow( os=OperatingSystem.MACOS_ARM64, @@ -222,6 +335,10 @@ class OperatingSystem: generate_binary_build_matrix.RELEASE, libtorch_variants=["shared-with-deps"], ), +<<<<<<< HEAD +======= + cross_compile_arm64=False, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) macos_runner="macos-14-xlarge", ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, @@ -234,6 +351,10 @@ class OperatingSystem: build_configs=generate_binary_build_matrix.generate_wheels_matrix( OperatingSystem.MACOS_ARM64 ), +<<<<<<< HEAD +======= + cross_compile_arm64=False, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) macos_runner="macos-14-xlarge", ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, @@ -293,10 +414,24 @@ def main() -> None: S390X_BINARY_BUILD_WORKFLOWS, ), ( +<<<<<<< HEAD +======= + jinja_env.get_template("linux_binary_build_workflow.yml.j2"), + LINUX_BINARY_SMOKE_WORKFLOWS, + ), + ( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jinja_env.get_template("windows_binary_build_workflow.yml.j2"), WINDOWS_BINARY_BUILD_WORKFLOWS, ), ( +<<<<<<< HEAD +======= + jinja_env.get_template("windows_binary_build_workflow.yml.j2"), + WINDOWS_BINARY_SMOKE_WORKFLOWS, + ), + ( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jinja_env.get_template("macos_binary_build_workflow.yml.j2"), MACOS_BINARY_BUILD_WORKFLOWS, ), diff --git a/.github/scripts/get_workflow_job_id.py b/.github/scripts/get_workflow_job_id.py index b04cbed76e955..bf8e669531096 100644 --- a/.github/scripts/get_workflow_job_id.py +++ b/.github/scripts/get_workflow_job_id.py @@ -136,10 +136,17 @@ def find_job_id_name(args: Any) -> tuple[str, str]: def set_output(name: str, val: Any) -> None: +<<<<<<< HEAD print(f"Setting output {name}={val}") if os.getenv("GITHUB_OUTPUT"): with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env: print(f"{name}={val}", file=env) +======= + if os.getenv("GITHUB_OUTPUT"): + with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env: + print(f"{name}={val}", file=env) + print(f"setting {name}={val}") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else: print(f"::set-output name={name}::{val}") diff --git a/.github/scripts/github_utils.py b/.github/scripts/github_utils.py index 110015988a5c3..4bd7228b9298c 100644 --- a/.github/scripts/github_utils.py +++ b/.github/scripts/github_utils.py @@ -18,7 +18,10 @@ class GitHubComment: body_text: str created_at: str author_login: str +<<<<<<< HEAD author_url: Optional[str] +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) author_association: str editor_login: Optional[str] database_id: int diff --git a/.github/scripts/lintrunner.sh b/.github/scripts/lintrunner.sh 
index b353617a45b2b..cd04147193c63 100755
--- a/.github/scripts/lintrunner.sh
+++ b/.github/scripts/lintrunner.sh
@@ -2,7 +2,11 @@
 set -ex
 # Use uv to speed up lintrunner init
+<<<<<<< HEAD
 python3 -m pip install -U uv==0.8.* setuptools
+=======
+python3 -m pip install uv==0.1.45 setuptools
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 CACHE_DIRECTORY="/tmp/.lintbin"
 # Try to recover the cached binaries
diff --git a/.github/scripts/parse_ref.py b/.github/scripts/parse_ref.py
index e821750a49e10..05433caa11efa 100755
--- a/.github/scripts/parse_ref.py
+++ b/.github/scripts/parse_ref.py
@@ -5,7 +5,10 @@
 def set_output(name: str, val: str) -> None:
+<<<<<<< HEAD
     print(f"Setting output {name}={val}")
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
     if os.getenv("GITHUB_OUTPUT"):
         with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
             print(f"{name}={val}", file=env)
diff --git a/.github/scripts/runner_determinator.py b/.github/scripts/runner_determinator.py
index baf560234549b..9af3be41dd65b 100644
--- a/.github/scripts/runner_determinator.py
+++ b/.github/scripts/runner_determinator.py
@@ -262,12 +262,16 @@ def is_exception_branch(branch: str) -> bool:
     """
     Branches that get opted out of experiments by default, until they're explicitly enabled.
     """
+<<<<<<< HEAD
     return branch.split("/", maxsplit=1)[0] in {
         "main",
         "nightly",
         "release",
         "landchecks",
     }
+=======
+    return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 def load_yaml(yaml_text: str) -> Any:
diff --git a/.github/scripts/tag_docker_images_for_release.py b/.github/scripts/tag_docker_images_for_release.py
new file mode 100644
index 0000000000000..b2bf474575f6f
--- /dev/null
+++ b/.github/scripts/tag_docker_images_for_release.py
@@ -0,0 +1,64 @@
+import argparse
+import subprocess
+
+import generate_binary_build_matrix
+
+
+def tag_image(
+    image: str,
+    default_tag: str,
+    release_version: str,
+    dry_run: str,
+    tagged_images: dict[str, bool],
+) -> None:
+    if image in tagged_images:
+        return
+    release_image = image.replace(f"-{default_tag}", f"-{release_version}")
+    print(f"Tagging {image} to {release_image} , dry_run: {dry_run}")
+
+    if dry_run == "disabled":
+        subprocess.check_call(["docker", "pull", image])
+        subprocess.check_call(["docker", "tag", image, release_image])
+        subprocess.check_call(["docker", "push", release_image])
+    tagged_images[image] = True
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--version",
+        help="Version to tag",
+        type=str,
+        default="2.2",
+    )
+    parser.add_argument(
+        "--dry-run",
+        help="No Runtime Error check",
+        type=str,
+        choices=["enabled", "disabled"],
+        default="enabled",
+    )
+
+    options = parser.parse_args()
+    tagged_images: dict[str, bool] = {}
+    platform_images = [
+        generate_binary_build_matrix.WHEEL_CONTAINER_IMAGES,
+        generate_binary_build_matrix.LIBTORCH_CONTAINER_IMAGES,
+    ]
+    default_tag = generate_binary_build_matrix.DEFAULT_TAG
+
+    for platform_image in platform_images:  # type: ignore[attr-defined]
+        for arch in platform_image.keys():  # type: ignore[attr-defined]
+            if arch == "cpu-s390x":
+                continue
+            tag_image(
+                platform_image[arch],  # type: ignore[index]
+                default_tag,
+                options.version,
+                options.dry_run,
+                tagged_images,
+            )
+
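For reference, the image-renaming step of the tag_docker_images_for_release.py script added above boils down to a simple suffix swap on the container tag. A minimal sketch of that one step, using a hypothetical image name rather than a real entry from WHEEL_CONTAINER_IMAGES:

```python
# Sketch of the tag_image() renaming step only; the image string is a made-up example.
default_tag = "main"          # stands in for generate_binary_build_matrix.DEFAULT_TAG
release_version = "2.2"       # stands in for the --version argument
image = "pytorch/manylinux2_28-builder:cuda12.8-main"  # hypothetical source image

release_image = image.replace(f"-{default_tag}", f"-{release_version}")
print(release_image)  # pytorch/manylinux2_28-builder:cuda12.8-2.2
```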
+ +if __name__ == "__main__": + main() diff --git a/.github/scripts/td_llm_indexer.sh b/.github/scripts/td_llm_indexer.sh index cc8f363659ba6..834664fc00d24 100644 --- a/.github/scripts/td_llm_indexer.sh +++ b/.github/scripts/td_llm_indexer.sh @@ -6,7 +6,11 @@ set -euxo pipefail cd llm-target-determinator pip install -q -r requirements.txt cd ../codellama +<<<<<<< HEAD pip install --no-build-isolation -v -e . +======= +pip install -e . +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pip install numpy==1.26.0 # Run indexer diff --git a/.github/scripts/test_check_labels.py b/.github/scripts/test_check_labels.py index 74ce276c9d10a..c699fad7346b7 100644 --- a/.github/scripts/test_check_labels.py +++ b/.github/scripts/test_check_labels.py @@ -38,7 +38,10 @@ def mock_get_comments() -> list[GitHubComment]: body_text="mock_body_text", created_at="", author_login="", +<<<<<<< HEAD author_url=None, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) author_association="", editor_login=None, database_id=1, @@ -49,7 +52,10 @@ def mock_get_comments() -> list[GitHubComment]: body_text=" #" + LABEL_ERR_MSG_TITLE.replace("`", ""), created_at="", author_login=BOT_AUTHORS[1], +<<<<<<< HEAD author_url=None, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) author_association="", editor_login=None, database_id=2, diff --git a/.github/scripts/test_trymerge.py b/.github/scripts/test_trymerge.py index 790deb85ef8c3..4e414a934d2ae 100755 --- a/.github/scripts/test_trymerge.py +++ b/.github/scripts/test_trymerge.py @@ -27,17 +27,26 @@ get_drci_classifications, gh_get_team_members, GitHubPR, +<<<<<<< HEAD iter_issue_timeline_until_comment, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) JobCheckState, main as trymerge_main, MandatoryChecksMissingError, MergeRule, +<<<<<<< HEAD PostCommentError, RE_GHSTACK_DESC, read_merge_rules, remove_job_name_suffix, sha_from_committed_event, sha_from_force_push_after, +======= + RE_GHSTACK_DESC, + read_merge_rules, + remove_job_name_suffix, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) validate_revert, ) @@ -74,9 +83,12 @@ def save_mocked_queries(obj: Any) -> None: if key in mocked_queries: return mocked_queries[key] +<<<<<<< HEAD # TODO: Remove me once https://github.com/pytorch/pytorch/issues/160489 is resolved raise ValueError(f"Key {key} could not be found in gql_mocks") +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) try: rc = fallback_function(*args) except HTTPError as err: @@ -128,7 +140,11 @@ def __init__(self) -> None: self.force = force self.pr_num = 76123 self.dry_run = True +<<<<<<< HEAD self.comment_id = 12345 # Set to non-zero value +======= + self.comment_id = 0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) self.reason = "this is for testing" self.ignore_current = False self.check_mergeability = False @@ -156,9 +172,15 @@ def mock_revert( def mock_merge( pr: GitHubPR, repo: GitRepo, +<<<<<<< HEAD comment_id: int, dry_run: bool = False, skip_mandatory_checks: bool = False, +======= + 
dry_run: bool = False, + skip_mandatory_checks: bool = False, + comment_id: Optional[int] = None, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout_minutes: int = 400, stale_pr_days: int = 3, ignore_current: bool = False, @@ -474,9 +496,15 @@ def test_main_force( mock_merge.assert_called_once_with( mock.ANY, mock.ANY, +<<<<<<< HEAD comment_id=mock.ANY, dry_run=mock.ANY, skip_mandatory_checks=True, +======= + dry_run=mock.ANY, + skip_mandatory_checks=True, + comment_id=mock.ANY, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ignore_current=False, ) @@ -489,9 +517,15 @@ def test_main_merge(self, mock_merge: Any, *args: Any) -> None: mock_merge.assert_called_once_with( mock.ANY, mock.ANY, +<<<<<<< HEAD comment_id=mock.ANY, dry_run=mock.ANY, skip_mandatory_checks=False, +======= + dry_run=mock.ANY, + skip_mandatory_checks=False, + comment_id=mock.ANY, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ignore_current=False, ) @@ -589,6 +623,7 @@ def test_get_merge_base(self, *args: Any) -> None: self.assertEqual(mock_merge_base, pr.get_merge_base()) mocked_gh_fetch_merge_base.assert_called_once() +<<<<<<< HEAD def test_app_can_revert(self, *args: Any) -> None: pr = GitHubPR("pytorch", "pytorch", 164660) repo = DummyGitRepo() @@ -606,6 +641,8 @@ def test_app_can_revert(self, *args: Any) -> None: "pytorch-auto-revert", ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) @mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql) @mock.patch("trymerge.gh_fetch_merge_base", return_value="") @@ -1159,6 +1196,7 @@ def test__revlist_to_prs_two_prs( ) +<<<<<<< HEAD @mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql) @mock.patch("trymerge.gh_fetch_merge_base", return_value="") @mock.patch( @@ -1330,5 +1368,7 @@ def test_get_commit_sha_at_comment_exception( self.assertIsNone(sha) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if __name__ == "__main__": main() diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py index c258284a00d83..bb63f20e87076 100755 --- a/.github/scripts/trymerge.py +++ b/.github/scripts/trymerge.py @@ -108,6 +108,13 @@ def __init__(self, name: str, url: str, run_id: int, status: Optional[str]): fragment PRCheckSuites on CheckSuiteConnection { edges { node { +<<<<<<< HEAD +======= + app { + name + databaseId + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) workflowRun { workflow { name @@ -234,7 +241,10 @@ def __init__(self, name: str, url: str, run_id: int, status: Optional[str]): createdAt author { login +<<<<<<< HEAD url +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } authorAssociation editor { @@ -451,6 +461,7 @@ def __init__(self, name: str, url: str, run_id: int, status: Optional[str]): IGNORABLE_FAILED_CHECKS_THESHOLD = 10 +<<<<<<< HEAD def iter_issue_timeline_until_comment( org: str, repo: str, issue_number: int, target_comment_id: int, max_pages: int = 200 ) -> Any: @@ -508,6 +519,8 @@ def sha_from_force_push_after(ev: dict[str, Any]) -> Optional[str]: return 
ev.get("after_sha") or ev.get("head_sha")
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 def gh_get_pr_info(org: str, proj: str, pr_no: int) -> Any:
     rc = gh_graphql(GH_GET_PR_INFO_QUERY, name=proj, owner=org, number=pr_no)
     return rc["data"]["repository"]["pullRequest"]
@@ -795,6 +808,7 @@ def get_changed_files_count(self) -> int:
     def last_commit(self) -> Any:
         return self.info["commits"]["nodes"][-1]["commit"]
+<<<<<<< HEAD
     def last_commit_sha(self, default: Optional[str] = None) -> str:
         # for commits, the oid is the sha
@@ -803,16 +817,26 @@ def last_commit_sha(self, default: Optional[str] = None) -> str:
         return str(self.last_commit().get("oid", default))
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
     def get_merge_base(self) -> str:
         if self.merge_base:
             return self.merge_base
+<<<<<<< HEAD
         last_commit_sha = self.last_commit_sha()
+=======
+        last_commit_oid = self.last_commit()["oid"]
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
         # NB: We could use self.base_ref() here for regular PR, however, that doesn't
         # work for ghstack where the base is the custom branch, i.e. gh/USER/ID/base,
         # so let's just use main instead
         self.merge_base = gh_fetch_merge_base(
+<<<<<<< HEAD
             self.org, self.project, last_commit_sha, self.default_branch()
+=======
+            self.org, self.project, last_commit_oid, self.default_branch()
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
         )
         # Fallback to baseRefOid if the API call fails, i.e. rate limit. Note that baseRefOid
@@ -901,6 +925,7 @@ def get_approved_by(self) -> list[str]:
     def get_commit_count(self) -> int:
         return int(self.info["commits_with_authors"]["totalCount"])
+<<<<<<< HEAD
     def get_commit_sha_at_comment(self, comment_id: int) -> Optional[str]:
        """
        Get the PR head commit SHA that was present when a specific comment was posted.
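The hunks above introduce timeline-based helpers (iter_issue_timeline_until_comment, sha_from_committed_event, sha_from_force_push_after, get_commit_sha_at_comment). As a rough sketch of that idea (not the actual trymerge implementation), one can page through GitHub's REST issue-timeline endpoint and keep the last head SHA seen before the target comment appears; any event field not visible in the diff above is an assumption here:

```python
# Hedged sketch only: approximates the timeline walk added in this diff.
# Assumes a GITHUB_TOKEN env var; event shapes follow GitHub's REST
# issues-timeline API, and the force-push fallback mirrors the
# ev.get("after_sha") or ev.get("head_sha") expression shown above.
import os
from typing import Any, Optional

import requests


def head_sha_at_comment(org: str, repo: str, pr_num: int, comment_id: int) -> Optional[str]:
    url = f"https://api.github.com/repos/{org}/{repo}/issues/{pr_num}/timeline"
    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"token {os.environ['GITHUB_TOKEN']}",
    }
    sha: Optional[str] = None
    page = 1
    while True:
        resp = requests.get(url, headers=headers, params={"per_page": 100, "page": page})
        resp.raise_for_status()
        events: list[dict[str, Any]] = resp.json()
        if not events:
            return None  # exhausted the timeline without finding the comment
        for ev in events:
            if ev.get("event") == "committed":
                sha = ev.get("sha")  # commit pushed to the PR branch
            elif ev.get("event") == "head_ref_force_pushed":
                sha = ev.get("after_sha") or ev.get("head_sha")
            elif ev.get("event") == "commented" and ev.get("id") == comment_id:
                return sha  # head SHA as of the moment the comment was posted
        page += 1
```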
@@ -939,6 +964,8 @@ def get_commit_sha_at_comment(self, comment_id: int) -> Optional[str]: print(f"Did not find comment with id {comment_id} in the PR timeline") return None +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def get_pr_creator_login(self) -> str: return cast(str, self.info["author"]["login"]) @@ -1092,9 +1119,14 @@ def _comment_from_node(node: Any) -> GitHubComment: editor = node["editor"] return GitHubComment( body_text=node["bodyText"], +<<<<<<< HEAD created_at=node.get("createdAt", ""), author_login=node["author"]["login"], author_url=node["author"].get("url", None), +======= + created_at=node["createdAt"] if "createdAt" in node else "", + author_login=node["author"]["login"], +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) author_association=node["authorAssociation"], editor_login=editor["login"] if editor else None, database_id=node["databaseId"], @@ -1256,7 +1288,11 @@ def merge_into( *, skip_mandatory_checks: bool = False, dry_run: bool = False, +<<<<<<< HEAD comment_id: int, +======= + comment_id: Optional[int] = None, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ignore_current_checks: Optional[list[str]] = None, ) -> None: # Raises exception if matching rule is not found @@ -1272,7 +1308,11 @@ def merge_into( skip_internal_checks=can_skip_internal_checks(self, comment_id), ignore_current_checks=ignore_current_checks, ) +<<<<<<< HEAD additional_merged_prs = self.merge_changes_locally( +======= + additional_merged_prs = self.merge_changes( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) repo, skip_mandatory_checks, comment_id ) @@ -1301,7 +1341,11 @@ def merge_into( broken_trunk_checks=ignorable_checks.get("BROKEN_TRUNK", []), flaky_checks=ignorable_checks.get("FLAKY", []), unstable_checks=ignorable_checks.get("UNSTABLE", []), +<<<<<<< HEAD last_commit_sha=self.last_commit_sha(default=""), +======= + last_commit_sha=self.last_commit().get("oid", ""), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) merge_base_sha=self.get_merge_base(), merge_commit_sha=merge_commit_sha, is_failed=False, @@ -1322,7 +1366,11 @@ def merge_into( dry_run=dry_run, ) +<<<<<<< HEAD def merge_changes_locally( +======= + def merge_changes( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) self, repo: GitRepo, skip_mandatory_checks: bool = False, @@ -1331,15 +1379,38 @@ def merge_changes_locally( skip_all_rule_checks: bool = False, ) -> list["GitHubPR"]: """ +<<<<<<< HEAD :param skip_all_rule_checks: If true, skips all rule checks on ghstack PRs, useful for dry-running merge locally +======= + :param skip_all_rule_checks: If true, skips all rule checks, useful for dry-running merge locally +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) """ branch_to_merge_into = self.default_branch() if branch is None else branch if repo.current_branch() != branch_to_merge_into: repo.checkout(branch_to_merge_into) +<<<<<<< HEAD # It's okay to skip the commit SHA check for ghstack PRs since # authoring requires write access to the repo. 
if self.is_ghstack_pr(): +======= + if not self.is_ghstack_pr(): + msg = self.gen_commit_message() + pr_branch_name = f"__pull-request-{self.pr_num}__init__" + repo.fetch(self.last_commit()["oid"], pr_branch_name) + repo._run_git("merge", "--squash", pr_branch_name) + repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg) + + # Did the PR change since we started the merge? + pulled_sha = repo.show_ref(pr_branch_name) + latest_pr_status = GitHubPR(self.org, self.project, self.pr_num) + if pulled_sha != latest_pr_status.last_commit()["oid"]: + raise RuntimeError( + "PR has been updated since CI checks last passed. Please rerun the merge command." + ) + return [] + else: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return self.merge_ghstack_into( repo, skip_mandatory_checks, @@ -1347,6 +1418,7 @@ def merge_changes_locally( skip_all_rule_checks=skip_all_rule_checks, ) +<<<<<<< HEAD msg = self.gen_commit_message() pr_branch_name = f"__pull-request-{self.pr_num}__init__" @@ -1389,6 +1461,8 @@ def merge_changes_locally( ) return [] +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class MergeRuleFailedError(RuntimeError): def __init__(self, message: str, rule: Optional["MergeRule"] = None) -> None: @@ -1593,7 +1667,11 @@ def find_matching_merge_rule( pending_checks = [] failed_checks = [] +<<<<<<< HEAD hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit_sha()}" +======= + hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit()['oid']}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if len(failed_checks) > 0: if reject_reason_score < 30000: reject_reason_score = 30000 @@ -2022,26 +2100,40 @@ def validate_revert( else pr.get_comment_by_id(comment_id) ) if comment.editor_login is not None: +<<<<<<< HEAD raise PostCommentError( "Halting the revert as the revert comment has been edited." ) +======= + raise PostCommentError("Don't want to revert based on edited command") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) author_association = comment.author_association author_login = comment.author_login allowed_reverters = ["COLLABORATOR", "MEMBER", "OWNER"] # For some reason, one can not be a member of private repo, only CONTRIBUTOR if pr.is_base_repo_private(): allowed_reverters.append("CONTRIBUTOR") +<<<<<<< HEAD # Special case the pytorch-auto-revert app, whose does not have association # But should be able to issue revert command if comment.author_url == "https://github.com/apps/pytorch-auto-revert": allowed_reverters.append("NONE") +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if author_association not in allowed_reverters: raise PostCommentError( f"Will not revert as @{author_login} is not one of " f"[{', '.join(allowed_reverters)}], but instead is {author_association}." 
) +<<<<<<< HEAD +======= + # Raises exception if matching rule is not found, but ignores all status checks + find_matching_merge_rule( + pr, repo, skip_mandatory_checks=True, skip_internal_checks=True + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) commit_sha = get_pr_commit_sha(repo, pr) return (author_login, commit_sha) @@ -2292,14 +2384,24 @@ def categorize_checks( def merge( pr: GitHubPR, repo: GitRepo, +<<<<<<< HEAD comment_id: int, dry_run: bool = False, skip_mandatory_checks: bool = False, +======= + dry_run: bool = False, + skip_mandatory_checks: bool = False, + comment_id: Optional[int] = None, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout_minutes: int = 400, stale_pr_days: int = 3, ignore_current: bool = False, ) -> None: +<<<<<<< HEAD initial_commit_sha = pr.last_commit_sha() +======= + initial_commit_sha = pr.last_commit()["oid"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pr_link = f"https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num}" print(f"Attempting merge of {initial_commit_sha} ({pr_link})") @@ -2370,7 +2472,11 @@ def merge( f"Attempting merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} ({elapsed_time / 60} minutes elapsed)" ) pr = GitHubPR(pr.org, pr.project, pr.pr_num) +<<<<<<< HEAD if initial_commit_sha != pr.last_commit_sha(): +======= + if initial_commit_sha != pr.last_commit()["oid"]: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) raise RuntimeError( "New commits were pushed while merging. Please rerun the merge command." 
) @@ -2537,7 +2643,11 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: if args.check_mergeability: if pr.is_ghstack_pr(): get_ghstack_prs(repo, pr) # raises error if out of sync +<<<<<<< HEAD pr.merge_changes_locally( +======= + pr.merge_changes( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) repo, skip_mandatory_checks=True, skip_all_rule_checks=True, @@ -2552,6 +2662,7 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: gh_post_pr_comment(org, project, args.pr_num, message, dry_run=args.dry_run) return try: +<<<<<<< HEAD # Ensure comment id is set, else fail if not args.comment_id: raise ValueError( @@ -2564,6 +2675,14 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: comment_id=args.comment_id, dry_run=args.dry_run, skip_mandatory_checks=args.force, +======= + merge( + pr, + repo, + dry_run=args.dry_run, + skip_mandatory_checks=args.force, + comment_id=args.comment_id, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ignore_current=args.ignore_current, ) except Exception as e: @@ -2585,7 +2704,11 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: broken_trunk_checks=[], flaky_checks=[], unstable_checks=[], +<<<<<<< HEAD last_commit_sha=pr.last_commit_sha(default=""), +======= + last_commit_sha=pr.last_commit().get("oid", ""), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) merge_base_sha=pr.get_merge_base(), is_failed=True, skip_mandatory_checks=args.force, diff --git a/.github/scripts/windows/build_magma.bat b/.github/scripts/windows/build_magma.bat index 75c916ecdbef7..28977ee042ffc 100644 --- a/.github/scripts/windows/build_magma.bat +++ b/.github/scripts/windows/build_magma.bat @@ -17,7 +17,10 @@ if errorlevel 1 exit /b 1 set "PATH=C:\Tools;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER%\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER%\libnvvp;%PATH%" set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER% +<<<<<<< HEAD set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mkdir magma_cuda%CUVER_NODOT% cd magma_cuda%CUVER_NODOT% @@ -35,9 +38,12 @@ cd magma mkdir build && cd build set GPU_TARGET=All +<<<<<<< HEAD if "%CUVER_NODOT%" == "130" ( set CUDA_ARCH_LIST=-gencode=arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if "%CUVER_NODOT%" == "129" ( set CUDA_ARCH_LIST=-gencode=arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 ) diff --git a/.github/scripts/windows/build_triton.bat b/.github/scripts/windows/build_triton.bat index d26dc8bf3b198..761d5cfbc962f 100644 --- a/.github/scripts/windows/build_triton.bat +++ b/.github/scripts/windows/build_triton.bat @@ -1,12 +1,30 @@ @echo on +<<<<<<< HEAD set 
DESIRED_PYTHON=%PY_VERS% call .ci/pytorch/windows/internal/install_python.bat :: Fix cmake version for issue https://github.com/pytorch/pytorch/issues/150480 %PYTHON_EXEC% -m pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==72.1.0 ninja==1.11.1.4 +======= +set PYTHON_PREFIX=%PY_VERS:.=% +set PYTHON_PREFIX=py%PYTHON_PREFIX:;=;py% +call .ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat +:: Create a new conda environment +if "%PY_VERS%" == "3.13t" ( + call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python-freethreading python=3.13 +) else ( + call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python=%PY_VERS% +) +:: Fix cmake version for issue https://github.com/pytorch/pytorch/issues/150480 +call conda run -n %PYTHON_PREFIX% pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==72.1.0 ninja +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dir "%VC_INSTALL_PATH%" call "%VC_INSTALL_PATH%\VC\Auxiliary\Build\vcvarsall.bat" x64 +<<<<<<< HEAD %PYTHON_EXEC% .github/scripts/build_triton_wheel.py --device=%BUILD_DEVICE% %RELEASE% +======= +call conda run -n %PYTHON_PREFIX% python .github/scripts/build_triton_wheel.py --device=%BUILD_DEVICE% %RELEASE% +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index 064eea7592230..c62af9388e5fd 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -4,7 +4,11 @@ {%- set download_artifact_action = "actions/download-artifact@v4.1.7" -%} {%- set timeout_minutes = 240 -%} +<<<<<<< HEAD {%- set timeout_minutes_windows_binary = 360 -%} +======= +{%- set timeout_minutes_windows_binary = 300 -%} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- macro concurrency(build_environment) -%} concurrency: @@ -32,7 +36,11 @@ concurrency: {%- macro setup_ec2_windows() -%} !{{ display_ec2_information() }} - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index baff04967e3ae..4b6e2db12a7ca 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -56,7 +56,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -71,15 +75,22 @@ jobs: with:!{{ upload.binary_env_as_input(config) }} {%- if "aarch64" in build_environment %} runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.arm64.r7g.12xlarge.memory +======= + runs_on: linux.arm64.m7g.4xlarge.ephemeral +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ALPINE_IMAGE: "arm64v8/alpine" {%- elif "s390x" in build_environment %} runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" timeout-minutes: 420 +<<<<<<< HEAD {%- elif config["gpu_arch_type"] == "rocm" %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" timeout-minutes: 300 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.24xlarge.ephemeral @@ -117,12 +128,21 @@ jobs: ALPINE_IMAGE: "docker.io/s390x/alpine" {%- elif config["gpu_arch_type"] == "rocm" %} runs_on: linux.rocm.gpu +<<<<<<< HEAD {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] in ["12.6"] %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner {%- elif config["gpu_arch_type"] == "cuda" %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] in ["12.8", "12.9"] %} + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + {%- elif config["gpu_arch_type"] == "cuda" %} + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- else %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.4xlarge @@ -138,7 +158,11 @@ jobs: contents: read steps: - name: Setup XPU +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/setup-xpu@main +======= + uses: ./.github/actions/setup-xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -153,10 +177,17 @@ jobs: with: name: !{{ config["build_name"] }} path: "${{ runner.temp }}/artifacts/" +<<<<<<< HEAD !{{ common.checkout(deep_clone=False, directory="pytorch") }} - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: !{{ config["container_image"] }} @@ -164,7 +195,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: 
pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -177,9 +212,12 @@ jobs: runs-on: linux.rocm.gpu.mi250 timeout-minutes: !{{ common.timeout_minutes }} !{{ upload.binary_env(config) }} +<<<<<<< HEAD permissions: id-token: write contents: read +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm @@ -188,7 +226,11 @@ jobs: with: name: !{{ config["build_name"] }} path: "${{ runner.temp }}/artifacts/" +<<<<<<< HEAD !{{ common.checkout(deep_clone=False, directory="pytorch") }} +======= + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: ROCm set GPU_FLAG run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" @@ -202,7 +244,11 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: !{{ config["container_image"] }} @@ -210,7 +256,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2 index ad5dd74972d0a..6ac18c7395fd7 100644 --- a/.github/templates/macos_binary_build_workflow.yml.j2 +++ b/.github/templates/macos_binary_build_workflow.yml.j2 @@ -22,6 +22,7 @@ name: !{{ build_environment }} echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" {%- endmacro %} +<<<<<<< HEAD {%- macro setup_python(py_ver) -%} - name: Setup Python uses: actions/setup-python@v6 @@ -31,6 +32,8 @@ name: !{{ build_environment }} freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }} {%- endmacro %} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) on: # TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 push: @@ -56,6 +59,12 @@ env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} SKIP_ALL_TESTS: 0 +<<<<<<< HEAD +======= +{%- if cross_compile_arm64 %} + CROSS_COMPILE_ARM64: 1 +{% endif %} +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) !{{ common.concurrency(build_environment) }} jobs: @@ -70,6 +79,7 @@ jobs: {%- endif %} steps: !{{ set_runner_specific_vars() }} +<<<<<<< HEAD !{{ setup_python(config.get("python_version", "3.10")) }} !{{ common.checkout(deep_clone=False, directory="pytorch") }} - name: Populate binary env @@ -77,6 +87,30 @@ jobs: "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | +======= + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + if [ -d "/Applications/Xcode_14.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + elif [ -d "/Applications/Xcode_13.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -93,6 +127,11 @@ jobs: {%- if config["package_type"] == "wheel" %} - name: Test PyTorch wheel run: | +<<<<<<< HEAD +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -102,10 +141,20 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" +<<<<<<< HEAD # shellcheck disable=SC2086 python -mvenv test_venv source test_venv/bin/activate +======= + if [[ $DESIRED_PYTHON == "3.13t" ]]; then + conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge + SMOKE_TEST_PARAMS="--torch-compile-check disabled" + else + conda create -yn "test_conda_env" python="$DESIRED_PYTHON" + fi + conda activate test_conda_env +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 diff --git a/.github/templates/upload.yml.j2 b/.github/templates/upload.yml.j2 index 5e3798f8e2377..ae519cc9a7330 100644 --- a/.github/templates/upload.yml.j2 +++ b/.github/templates/upload.yml.j2 @@ -15,7 +15,11 @@ # favor of GPU_ARCH_VERSION DESIRED_CUDA: !{{ config["desired_cuda"] }} {%- if config["gpu_arch_version"] %} +<<<<<<< HEAD GPU_ARCH_VERSION: "!{{ config["gpu_arch_version"] }}" +======= + GPU_ARCH_VERSION: !{{ config["gpu_arch_version"] }} +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- endif %} GPU_ARCH_TYPE: !{{ config["gpu_arch_type"] }} {%- if include_skip_tests %} @@ -25,6 +29,14 @@ DOCKER_IMAGE: !{{ config["container_image"] }} DOCKER_IMAGE_TAG_PREFIX: !{{ config["container_image_tag_prefix"] }} {%- endif %} +<<<<<<< HEAD +======= +{%- if config["package_type"] == "manywheel" %} + {%- if config.use_split_build is defined %} + use_split_build: !{{ config["use_split_build"] }} + {%- endif %} +{%- endif %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- if config["package_type"] == "libtorch" %} {%- if config["libtorch_config"] %} LIBTORCH_CONFIG: !{{ config["libtorch_config"] }} @@ -33,7 +45,11 @@ {%- if is_windows %} # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- endif %} {%- else %} diff --git a/.github/templates/windows_binary_build_workflow.yml.j2 b/.github/templates/windows_binary_build_workflow.yml.j2 index 34c148270c6bc..a566a3dde764f 100644 --- a/.github/templates/windows_binary_build_workflow.yml.j2 +++ b/.github/templates/windows_binary_build_workflow.yml.j2 @@ -64,7 +64,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -79,9 +83,15 @@ jobs: runs-on: "windows-11-arm64-preview" {%- else %} {%- if branches == "nightly" %} +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" {%- else %} runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge.nonephemeral" +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + {%- else %} + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- endif %} {%- endif %} timeout-minutes: !{{ common.timeout_minutes_windows_binary }} @@ -135,7 +145,11 @@ jobs: {%- else %} !{{ set_runner_specific_vars() }} !{{ common.setup_ec2_windows() }} +<<<<<<< HEAD !{{ common.checkout(deep_clone=False, directory="pytorch") }} +======= + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- endif %} - name: Populate binary env shell: bash @@ -211,7 +225,11 @@ jobs: "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" {%- else %} !{{ common.setup_ec2_windows() }} +<<<<<<< HEAD !{{ common.checkout(deep_clone=False, directory="pytorch") }} +======= + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) !{{ set_runner_specific_vars() }} {%- endif %} - uses: !{{ common.download_artifact_action }} diff --git a/.github/workflows/_bazel-build-test.yml b/.github/workflows/_bazel-build-test.yml index 72241a772be61..59a23462af5d2 100644 --- a/.github/workflows/_bazel-build-test.yml +++ b/.github/workflows/_bazel-build-test.yml @@ -47,7 +47,11 @@ jobs: reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }} steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false @@ -69,25 +73,41 @@ jobs: runs-on: ${{ matrix.runner }} steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup Linux uses: ./.github/actions/setup-linux - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ${{ inputs.docker-image-name }} - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -97,7 +117,11 @@ jobs: run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-nvidia@main +======= + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ inputs.cuda-version != 'cpu' && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} - name: Output disk space left @@ -209,5 +233,9 @@ jobs: file-suffix: bazel-${{ github.job }}_${{ steps.get-job-id.outputs.job-id }} - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() diff --git 
a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml index bfa035bc753b8..4bb6b0198d7d4 100644 --- a/.github/workflows/_binary-build-linux.yml +++ b/.github/workflows/_binary-build-linux.yml @@ -26,6 +26,16 @@ on: default: 240 type: number description: timeout for the job +<<<<<<< HEAD +======= + use_split_build: + description: | + [Experimental] Build a libtorch only wheel and build pytorch such that + are built from the libtorch wheel. + required: false + type: boolean + default: false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ALPINE_IMAGE: required: false type: string @@ -110,6 +120,10 @@ jobs: PR_NUMBER: ${{ github.event.pull_request.number }} PYTORCH_FINAL_PACKAGE_DIR: /artifacts SHA1: ${{ github.event.pull_request.head.sha || github.sha }} +<<<<<<< HEAD +======= + USE_SPLIT_BUILD: ${{ inputs.use_split_build }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Make the env permanent during this workflow (but not the secrets) shell: bash @@ -134,6 +148,10 @@ jobs: echo "PR_NUMBER=${{ env.PR_NUMBER }}" echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" echo "SHA1=${{ env.SHA1 }}" +<<<<<<< HEAD +======= + echo "USE_SPLIT_BUILD=${{ env.use_split_build }}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } >> "${GITHUB_ENV} }}" - name: List the env @@ -142,13 +160,21 @@ jobs: - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" if: inputs.build_environment != 'linux-s390x-binary-manywheel' +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.github-token }} - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }} @@ -178,7 +204,10 @@ jobs: - name: Checkout PyTorch to pytorch dir uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -213,9 +242,15 @@ jobs: - name: Calculate docker image id: calculate-docker-image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main with: # If doing this in main or release branch, use docker.io. Otherwise +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + # If doing this in release/2.8 or release branch, use docker.io. 
Otherwise +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # use ECR docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: ${{ inputs.DOCKER_IMAGE }} @@ -227,7 +262,11 @@ jobs: - name: Pull Docker image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -252,6 +291,10 @@ jobs: -e PYTORCH_ROOT \ -e SKIP_ALL_TESTS \ -e PYTORCH_EXTRA_INSTALL_REQUIREMENTS \ +<<<<<<< HEAD +======= + -e USE_SPLIT_BUILD \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --tty \ --detach \ -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ @@ -283,7 +326,11 @@ jobs: - name: Teardown Linux if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Chown workspace if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml index 476dd182db0f8..5fdd16f8db0cb 100644 --- a/.github/workflows/_binary-test-linux.yml +++ b/.github/workflows/_binary-test-linux.yml @@ -64,6 +64,16 @@ on: required: true type: string description: Hardware to run this job on. Valid values are linux.4xlarge, linux.4xlarge.nvidia.gpu, linux.arm64.2xlarge, and linux.rocm.gpu +<<<<<<< HEAD +======= + use_split_build: + description: | + [Experimental] Build a libtorch only wheel and build pytorch such that + are built from the libtorch wheel. 
+ required: false + type: boolean + default: false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: required: true @@ -97,6 +107,10 @@ jobs: PR_NUMBER: ${{ github.event.pull_request.number }} PYTORCH_FINAL_PACKAGE_DIR: /artifacts SHA1: ${{ github.event.pull_request.head.sha || github.sha }} +<<<<<<< HEAD +======= + USE_SPLIT_BUILD: ${{ inputs.use_split_build }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Make the env permanent during this workflow (but not the secrets) shell: bash @@ -121,18 +135,30 @@ jobs: echo "PR_NUMBER=${{ env.PR_NUMBER }}" echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" echo "SHA1=${{ env.SHA1 }}" +<<<<<<< HEAD +======= + echo "USE_SPLIT_BUILD=${{ env.USE_SPLIT_BUILD }}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } >> "${GITHUB_ENV} }}" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" if: inputs.build_environment != 'linux-s390x-binary-manywheel' +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.github-token }} # Setup the environment - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }} @@ -155,7 +181,10 @@ jobs: - name: Checkout PyTorch to pytorch dir uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive show-progress: false path: pytorch @@ -186,7 +215,11 @@ jobs: path: "${{ runner.temp }}/artifacts/" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-nvidia@main +======= + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }} - name: configure aws credentials @@ -201,7 +234,11 @@ jobs: - name: Calculate docker image id: calculate-docker-image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ 
startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: ${{ inputs.DOCKER_IMAGE }} @@ -211,7 +248,11 @@ jobs: - name: Pull Docker image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -223,7 +264,11 @@ jobs: - name: Teardown Linux if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Chown workspace if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' diff --git a/.github/workflows/_binary-upload.yml b/.github/workflows/_binary-upload.yml index 636b76d42931a..98f40e28fcf23 100644 --- a/.github/workflows/_binary-upload.yml +++ b/.github/workflows/_binary-upload.yml @@ -51,6 +51,16 @@ on: required: false type: string description: Desired python version +<<<<<<< HEAD +======= + use_split_build: + description: | + [Experimental] Build a libtorch only wheel and build pytorch such that + are built from the libtorch wheel. + required: false + type: boolean + default: false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: required: true @@ -79,9 +89,16 @@ jobs: PR_NUMBER: ${{ github.event.pull_request.number }} PYTORCH_FINAL_PACKAGE_DIR: /artifacts SHA1: ${{ github.event.pull_request.head.sha || github.sha }} +<<<<<<< HEAD steps: - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + USE_SPLIT_BUILD: ${{ inputs.use_split_build }} + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml index ebf96264e9944..7f2a81569d1b6 100644 --- a/.github/workflows/_docs.yml +++ b/.github/workflows/_docs.yml @@ -67,6 +67,7 @@ jobs: # an OOM issue when running the job, so this upgrades the runner from 4xlarge # to the next available tier of 12xlarge. 
So much memory just to generate cpp # doc +<<<<<<< HEAD runner: ${{ inputs.runner_prefix }}linux.12xlarge.memory # TODO: Nightly cpp docs take longer and longer to finish (more than 3h now) # Let's try to figure out how this can be improved @@ -75,12 +76,30 @@ jobs: runner: ${{ inputs.runner_prefix }}linux.c7i.2xlarge # It takes less than 30m to finish python docs unless there are issues timeout-minutes: 30 +======= + runner: ${{ inputs.runner_prefix }}linux.12xlarge + # TODO: Nightly cpp docs take longer and longer to finish (more than 3h now) + # Let's try to figure out how this can be improved + timeout-minutes: 240 + - docs_type: python + runner: ${{ inputs.runner_prefix }}linux.2xlarge + # It takes less than 30m to finish python docs unless there are issues + timeout-minutes: 30 + - docs_type: functorch + runner: ${{ inputs.runner_prefix }}linux.2xlarge + # It takes less than 15m to finish functorch docs unless there are issues + timeout-minutes: 15 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180) # The current name requires updating the database last docs push query from test-infra every time the matrix is updated name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }} steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -91,7 +110,11 @@ jobs: # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup Linux uses: ./.github/actions/setup-linux @@ -106,12 +129,20 @@ jobs: - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ${{ inputs.docker-image }} - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -207,6 +238,21 @@ jobs: path: cppdocs/ s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/cppdocs +<<<<<<< HEAD - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + - name: Upload functorch Docs Preview + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 + if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'functorch' && steps.build-docs.outcome == 'success' }} + with: + retention-days: 14 + s3-bucket: 
doc-previews + if-no-files-found: error + path: functorch_ghpages/nightly/ + s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() diff --git a/.github/workflows/_link_check.yml b/.github/workflows/_link_check.yml index 014e6106b0730..9287aa304b2af 100644 --- a/.github/workflows/_link_check.yml +++ b/.github/workflows/_link_check.yml @@ -11,9 +11,14 @@ on: jobs: lint-urls: if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-url-lint') }} +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: job-name: lint-urls +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 + with: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout: 120 runner: ${{ inputs.runner }}linux.2xlarge docker-image: ci-image:pytorch-linux-jammy-linter @@ -37,9 +42,14 @@ jobs: lint-xrefs: if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-xref-lint') }} +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: job-name: lint-xrefs +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 + with: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout: 60 runner: ${{ inputs.runner }}linux.2xlarge docker-image: ci-image:pytorch-linux-jammy-linter diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index cc0064391fdef..6790c0d482289 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -16,6 +16,14 @@ on: type: boolean default: true description: If set, upload generated build artifacts. +<<<<<<< HEAD +======= + build-with-debug: + required: false + type: boolean + default: false + description: If set, build in debug mode. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sync-tag: required: false type: string @@ -37,7 +45,11 @@ on: runner: required: false type: string +<<<<<<< HEAD default: "linux.c7i.2xlarge" +======= + default: "linux.2xlarge" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) description: | Label of the runner this job should run on. 
test-matrix: @@ -64,6 +76,14 @@ on: required: false type: string default: "" +<<<<<<< HEAD +======= + max-jobs: + description: | + Overwrite the number of jobs to use for the build + required: false + type: string +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) disable-monitor: description: | Disable utilization monitoring for build job @@ -82,6 +102,10 @@ on: required: false type: number default: 1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) allow-reuse-old-whl: description: | If set, the build try to pull an old wheel from s3 that was built on a @@ -89,6 +113,7 @@ on: required: false type: boolean default: true +<<<<<<< HEAD build-additional-packages: description: | If set, the build job will also builds these packages and saves their @@ -103,6 +128,8 @@ on: required: false type: string default: "" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: HUGGING_FACE_HUB_TOKEN: @@ -114,6 +141,10 @@ on: description: | FB app token to write to scribe endpoint +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) outputs: docker-image: value: ${{ jobs.build.outputs.docker-image }} @@ -128,12 +159,17 @@ jobs: # Don't run on forked repos if: github.repository_owner == 'pytorch' runs-on: ${{ inputs.runner_prefix}}${{ inputs.runner }} +<<<<<<< HEAD timeout-minutes: 480 +======= + timeout-minutes: 240 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) outputs: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} test-matrix: ${{ steps.filter.outputs.test-matrix }} steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: @@ -141,13 +177,23 @@ jobs: instructions: | Build is done inside the container, to start an interactive session run: docker exec -it $(docker container ps --format '{{.ID}}') bash +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + if: inputs.build-environment != 'linux-s390x-binary-manywheel' + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # [pytorch repo ref] # Use a pytorch/pytorch reference instead of a reference to the local # checkout because when we run this action we don't *have* a local # checkout. In other cases you should prefer a local checkout. 
- name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true @@ -183,7 +229,11 @@ jobs: - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image-name: ${{ inputs.docker-image-name }} @@ -199,7 +249,11 @@ jobs: echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true' with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -232,7 +286,11 @@ jobs: MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} run: | mkdir -p ../../usage_logs +<<<<<<< HEAD python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 +======= + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python3 -m tools.stats.monitor \ --log-interval "$MONITOR_LOG_INTERVAL" \ --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" \ @@ -254,6 +312,11 @@ jobs: env: BUILD_ENVIRONMENT: ${{ inputs.build-environment }} BRANCH: ${{ steps.parse-ref.outputs.branch }} +<<<<<<< HEAD +======= + # TODO duplicated + AWS_DEFAULT_REGION: us-east-1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} # Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs @@ -265,11 +328,19 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} DOCKER_IMAGE_S390X: ${{ inputs.docker-image-name }} XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} +<<<<<<< HEAD OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} BUILD_ADDITIONAL_PACKAGES: ${{ inputs.build-additional-packages }} RUNNER: ${{ inputs.runner }} +======= + DEBUG: ${{ inputs.build-with-debug && '1' || '0' }} + OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} + MAX_JOBS_OVERRIDE: ${{ inputs.max-jobs }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run: | START_TIME=$(date +%s) if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then @@ -289,12 +360,22 @@ jobs: DOCKER_SHELL_CMD= fi 
+<<<<<<< HEAD +======= + if [[ ${MAX_JOBS_OVERRIDE} == "" ]]; then + MAX_JOBS="$(nproc --ignore=2)" + else + MAX_JOBS="${MAX_JOBS_OVERRIDE}" + fi + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Leaving 1GB for the runner and other things TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo) # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap # comes from https://github.com/pytorch/test-infra/pull/6058 TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3)) +<<<<<<< HEAD if [[ ${BUILD_ENVIRONMENT} == *"riscv64"* ]]; then # EC2 specific setup for RISC-V emulation # Ensure binfmt_misc is available @@ -320,13 +401,22 @@ jobs: RISCV_DOCKER_ARGS= fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # detached container should get cleaned up by teardown_ec2_linux # Used for JENKINS_USER and DOCKER_SHELL_CMD, which can be empty # shellcheck disable=SC2086 container_name=$(docker run \ +<<<<<<< HEAD ${RISCV_DOCKER_ARGS} \ -e BUILD_ENVIRONMENT \ -e MAX_JOBS="$(nproc --ignore=2)" \ +======= + -e BUILD_ENVIRONMENT \ + -e MAX_JOBS=${MAX_JOBS} \ + -e MAX_JOBS_OVERRIDE \ + -e AWS_DEFAULT_REGION \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) -e PR_NUMBER \ -e SHA1 \ -e BRANCH \ @@ -340,8 +430,12 @@ jobs: -e OUR_GITHUB_JOB_ID \ -e HUGGING_FACE_HUB_TOKEN \ -e SCRIBE_GRAPHQL_ACCESS_TOKEN \ +<<<<<<< HEAD -e BUILD_ADDITIONAL_PACKAGES \ -e RUNNER \ +======= + -e USE_SPLIT_BUILD \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \ --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ @@ -355,16 +449,20 @@ jobs: "${USED_IMAGE}" \ ${DOCKER_SHELL_CMD} ) +<<<<<<< HEAD if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then docker exec -t "${container_name}" sh -c "python3 -m pip install -r requirements.txt" fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker exec -t "${container_name}" sh -c '.ci/pytorch/build.sh' END_TIME=$(date +%s) echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT" +<<<<<<< HEAD - name: Build external packages id: build-external-packages if: inputs.build-external-packages != '' && steps.build.outcome != 'skipped' @@ -385,6 +483,8 @@ jobs: mv "$src" "dist/$(dirname "$src")/" fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Stop monitoring script if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }} shell: bash @@ -457,7 +557,11 @@ jobs: artifact_prefix: usage_log_build_${{ steps.get-job-id.outputs.job-id }} - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel' - name: Cleanup docker diff --git a/.github/workflows/_linux-test.yml 
b/.github/workflows/_linux-test.yml index 29c2fc8e08476..244f12494512d 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -72,10 +72,13 @@ on: required: false description: | HF Auth token to avoid rate limits when downloading models or datasets from hub +<<<<<<< HEAD VLLM_TEST_HUGGING_FACE_TOKEN: required: false description: | HF Auth token to test vllm +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) SCRIBE_GRAPHQL_ACCESS_TOKEN: required: false description: | @@ -94,6 +97,7 @@ jobs: environment: ${{ github.ref == 'refs/heads/main' && 'scribe-protected' || startsWith(github.ref, 'refs/heads/release/') && 'scribe-protected' || contains(github.event.pull_request.labels.*.name, 'ci-scribe') && 'scribe-pr' || '' }} runs-on: ${{ matrix.runner }} timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} +<<<<<<< HEAD permissions: id-token: write contents: read @@ -101,6 +105,12 @@ jobs: - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@main if: ${{ !contains(matrix.runner, 'b200') && inputs.build-environment != 'linux-s390x-binary-manywheel' }} +======= + steps: + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + if: ${{ !contains(matrix.runner, 'gcp.a100') && inputs.build-environment != 'linux-s390x-binary-manywheel' }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -108,6 +118,7 @@ jobs: docker exec -it $(docker container ps --format '{{.ID}}') bash - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main with: no-sudo: true @@ -125,12 +136,25 @@ jobs: - name: configure aws credentials if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }} +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 + with: + no-sudo: true + + - name: Setup Linux + uses: ./.github/actions/setup-linux + if: inputs.build-environment != 'linux-s390x-binary-manywheel' + + - name: configure aws credentials + if : ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 with: role-to-assume: ${{ inputs.aws-role-to-assume }} role-session-name: gha-linux-test aws-region: us-east-1 +<<<<<<< HEAD - name: Login to Amazon ECR if: ${{ inputs.aws-role-to-assume != '' && contains(matrix.runner, 'b200') }} id: login-ecr @@ -140,6 +164,11 @@ jobs: - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image-name: ${{ inputs.docker-image }} @@ -155,7 +184,11 @@ jobs: echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - 
name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -167,20 +200,33 @@ jobs: - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG id: install-nvidia-driver +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-nvidia@main with: driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }} if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }} +======= + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.8 + if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup GPU_FLAG for docker run id: setup-gpu-flag run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" +<<<<<<< HEAD if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || contains(matrix.runner, 'b200')) }} +======= + if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container id: setup-sscache-port-flag run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}" +<<<<<<< HEAD if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && !contains(matrix.runner, 'b200') }} +======= + if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Lock NVIDIA A100 40GB Frequency run: | @@ -209,7 +255,11 @@ jobs: MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} run: | +<<<<<<< HEAD python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 +======= + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" @@ -224,6 +274,7 @@ jobs: continue-on-error: true uses: ./.github/actions/download-td-artifacts +<<<<<<< HEAD - name: Download Windows torch wheel for cross-compilation if: matrix.win_torch_wheel_artifact != '' uses: seemethere/download-artifact-s3@1da556a7aa0a088e3153970611f6c432d58e80e6 # 
v4.2.0 @@ -264,6 +315,8 @@ jobs: echo "CUDA libraries:" ls -la win-torch-wheel-extracted/lib/x64/ || echo "No CUDA libraries found" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py @@ -287,12 +340,15 @@ jobs: run: | echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}" +<<<<<<< HEAD - name: Preserve github env variables for use in docker shell: bash run: | env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}" env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Test id: test timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }} @@ -313,8 +369,11 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} +<<<<<<< HEAD EXTRA_FLAGS: ${{ matrix.extra_flags || '' }} OP_BENCHMARK_TESTS: ${{ matrix.op_benchmark_tests }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} @@ -323,8 +382,13 @@ jobs: NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }} # Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs +<<<<<<< HEAD SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }} SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }} +======= + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + SCCACHE_REGION: us-east-1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }} DOCKER_IMAGE: ${{ inputs.docker-image }} XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} @@ -332,9 +396,15 @@ jobs: PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} DASHBOARD_TAG: ${{ inputs.dashboard-tag }} +<<<<<<< HEAD VLLM_TEST_HUGGING_FACE_TOKEN: ${{ secrets.VLLM_TEST_HUGGING_FACE_TOKEN }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} +======= + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} + IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} run: | set -x @@ -360,6 +430,13 @@ jobs: # if for some reason cleanup action doesn't stop container # when job is cancelled DOCKER_SHELL_CMD="sleep 12h" +<<<<<<< HEAD +======= + + # since some steps are skipped on s390x, if they are necessary, run them here + env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}" + env | 
grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else SHM_OPTS="--shm-size=${SHM_SIZE}" JENKINS_USER="--user jenkins" @@ -409,9 +486,15 @@ jobs: -e PYTORCH_TEST_RERUN_DISABLED_TESTS \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e HUGGING_FACE_HUB_TOKEN \ +<<<<<<< HEAD -e VLLM_TEST_HUGGING_FACE_TOKEN \ -e SCRIBE_GRAPHQL_ACCESS_TOKEN \ -e DASHBOARD_TAG \ +======= + -e SCRIBE_GRAPHQL_ACCESS_TOKEN \ + -e DASHBOARD_TAG \ + -e IS_A100_RUNNER \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) -e ARTIFACTS_FILE_SUFFIX \ --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \ --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \ @@ -429,6 +512,11 @@ jobs: "${DOCKER_IMAGE}" \ ${DOCKER_SHELL_CMD} ) +<<<<<<< HEAD +======= + # Propagate download.pytorch.org IP to container + grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}" if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then @@ -448,6 +536,7 @@ jobs: test_config: ${{ matrix.config }} job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} +<<<<<<< HEAD - name: Authenticate with AWS if: ${{ always() && contains(matrix.runner, 'b200') }} uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 @@ -459,6 +548,10 @@ jobs: - name: Upload the benchmark results uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main +======= + - name: Upload the benchmark results + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: benchmark-results-dir: test/test-reports @@ -516,7 +609,11 @@ jobs: workflow_attempt: ${{github.run_attempt}} - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' # NB: We are currently having an intermittent GPU-related issue on G5 runners with diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml index 24fe510f0fb59..bdd40b4674a6d 100644 --- a/.github/workflows/_mac-build.yml +++ b/.github/workflows/_mac-build.yml @@ -67,11 +67,19 @@ jobs: test-matrix: ${{ steps.filter.outputs.test-matrix }} steps: - name: Clean up disk space before running MacOS workflow +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/check-disk-space@main # [see note: pytorch repo ref] - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.8 + + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: 
Set xcode version env: @@ -82,10 +90,17 @@ jobs: fi - name: Setup Python +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-python@main with: python-version: ${{ inputs.python-version }} pip-requirements-file: .ci/docker/requirements-ci.txt +======= + uses: pytorch/test-infra/.github/actions/setup-python@release/2.8 + with: + python-version: ${{ inputs.python-version }} + pip-requirements-file: .github/requirements/pip-requirements-macOS.txt +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Install sccache (only for non-forked PRs, and pushes to trunk) uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 @@ -123,7 +138,11 @@ jobs: else # The runner has access to the S3 bucket via IAM profile without the need # for any credential +<<<<<<< HEAD echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" +======= + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "SCCACHE_S3_KEY_PREFIX=${GITHUB_WORKFLOW}" >> "${GITHUB_ENV}" fi @@ -152,14 +171,27 @@ jobs: env: OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} run: | +<<<<<<< HEAD # TODO: Remove me later, and properly activate venv PATH="$VENV_PATH/bin:$PATH" export PATH +======= + echo "CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${GITHUB_ENV}" + + if [[ -n "$CONDA_ENV" ]]; then + # Use binaries under conda environment + export PATH="$CONDA_ENV/bin":$PATH + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NB: Same trick as Linux, there is no need to initialize sccache with the risk of getting # it hangs or timeout at initialization. 
The cache will be started automatically export SKIP_SCCACHE_INITIALIZATION=1 +<<<<<<< HEAD .ci/pytorch/macos-build.sh +======= + ${CONDA_RUN} .ci/pytorch/macos-build.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Archive artifacts into zip if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' @@ -188,4 +220,8 @@ jobs: - name: Clean up disk space if: always() continue-on-error: true +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/check-disk-space@main +======= + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index 82eb3c4bf2c75..b065453b7f009 100644 --- a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -88,6 +88,7 @@ jobs: pkill "${PROCESS}" || true done +<<<<<<< HEAD - name: Clean up brew miniconda, if installed continue-on-error: true run: | @@ -95,6 +96,11 @@ jobs: brew uninstall miniconda echo "REINSTALL_BREW_MINICONDA=1" >> "${GITHUB_ENV}" fi +======= + - name: Clean up leftover miniconda installation + continue-on-error: true + run: brew uninstall miniconda || true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Clean up leftover local python3 site-packages on MacOS pet runner continue-on-error: true @@ -105,11 +111,19 @@ jobs: done - name: Clean up disk space before running MacOS workflow +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/check-disk-space@main # [see note: pytorch repo ref] - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.8 + + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Get workflow job id id: get-job-id @@ -118,12 +132,15 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} +<<<<<<< HEAD - name: Setup Python uses: pytorch/test-infra/.github/actions/setup-python@main with: python-version: ${{ inputs.python-version }} pip-requirements-file: .ci/docker/requirements-ci.txt +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Start monitoring script id: monitor-script if: ${{ !inputs.disable-monitor }} @@ -136,8 +153,13 @@ jobs: MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} run: | +<<<<<<< HEAD "$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_json==0.6.7 "$VENV_PATH/bin/python3" -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & +======= + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 + python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "monitor-script-pid=${!}" >> 
"${GITHUB_OUTPUT}" - name: Download build artifacts @@ -152,6 +174,16 @@ jobs: with: use-gha: true +<<<<<<< HEAD +======= + - name: Setup Python + uses: pytorch/test-infra/.github/actions/setup-python@release/2.8 + with: + python-version: ${{ inputs.python-version }} + pip-requirements-file: .github/requirements/pip-requirements-macOS.txt + default-packages: "" + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py @@ -202,7 +234,11 @@ jobs: set -ex # TODO: Remove me later, and properly activate venv +<<<<<<< HEAD PATH="$VENV_PATH/bin:$PATH" +======= + PATH="$(dirname "$(which python)"):$PATH" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) export PATH # Print out some information about the test environment @@ -257,7 +293,11 @@ jobs: file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} - name: Upload the benchmark results +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main +======= + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: benchmark-results-dir: test/test-reports dry-run: false @@ -276,6 +316,7 @@ jobs: workflow_attempt: ${{github.run_attempt}} local_path: usage_log.txt +<<<<<<< HEAD - name: Reinstall brew miniconda, if was installed if: always() continue-on-error: true @@ -288,3 +329,9 @@ jobs: if: always() continue-on-error: true uses: pytorch/test-infra/.github/actions/check-disk-space@main +======= + - name: Clean up disk space + if: always() + continue-on-error: true + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml index 43ed76a63cc67..35c814ae5afe0 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -62,11 +62,14 @@ on: required: false type: number default: 1 +<<<<<<< HEAD secrets: HUGGING_FACE_HUB_TOKEN: required: false description: | HF Auth token to avoid rate limits when downloading models or datasets from hub +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} @@ -81,17 +84,27 @@ jobs: strategy: matrix: ${{ fromJSON(inputs.test-matrix) }} fail-fast: false +<<<<<<< HEAD runs-on: ${{ matrix.runner }} timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} steps: - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} + runs-on: ${{ matrix.runner }} + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true - name: Setup ROCm uses: 
./.github/actions/setup-rocm +<<<<<<< HEAD - name: Runner check GPU count (distributed jobs) if: ${{ contains(matrix.config, 'distributed') }} shell: bash @@ -105,11 +118,33 @@ jobs: - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + - name: configure aws credentials + id: aws_creds + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + + - name: Login to Amazon ECR + id: login-ecr + continue-on-error: true + uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ${{ inputs.docker-image }} - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -122,9 +157,12 @@ jobs: - name: Start monitoring script id: monitor-script +<<<<<<< HEAD if: ${{ !inputs.disable-monitor }} shell: bash continue-on-error: true +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: JOB_ID: ${{ steps.get-job-id.outputs.job-id }} JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} @@ -132,8 +170,16 @@ jobs: WORKFLOW_RUN_ID: ${{github.run_id}} MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} +<<<<<<< HEAD run: | python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 +======= + if: ${{ !inputs.disable-monitor }} + shell: bash + continue-on-error: true + run: | + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" @@ -169,12 +215,15 @@ jobs: run: | echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}" +<<<<<<< HEAD - name: Preserve github env variables for use in docker shell: bash run: | env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}" env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Test id: test env: @@ -190,22 +239,35 @@ jobs: JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} BRANCH: ${{ steps.parse-ref.outputs.branch }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} +<<<<<<< HEAD BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }} TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} 
+======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }} NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }} NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} +<<<<<<< HEAD +======= + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + NUM_TEST_SHARDS: ${{ matrix.num_shards }} + REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DOCKER_IMAGE: ${{ inputs.docker-image }} PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }} DASHBOARD_TAG: ${{ inputs.dashboard-tag }} +<<<<<<< HEAD HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }} run: | set -x @@ -235,7 +297,10 @@ jobs: -e GITHUB_RUN_ATTEMPT \ -e JOB_ID \ -e JOB_NAME \ +<<<<<<< HEAD -e BASE_SHA \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) -e BRANCH \ -e SHA1 \ -e AWS_DEFAULT_REGION \ @@ -253,12 +318,18 @@ jobs: -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \ -e PYTORCH_TEST_RERUN_DISABLED_TESTS \ -e TESTS_TO_INCLUDE \ +<<<<<<< HEAD -e HUGGING_FACE_HUB_TOKEN \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) -e DASHBOARD_TAG \ --env-file="${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" \ --ulimit stack=10485760:83886080 \ --ulimit core=0 \ +<<<<<<< HEAD --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ --shm-size="8g" \ @@ -281,8 +352,13 @@ jobs: # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test" +<<<<<<< HEAD - name: Change permissions (only needed for kubernetes runners for now) if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'gfx942') || contains(matrix.runner, 'mi355')) }} +======= + - name: Change permissions (only needed for MI300 runners for now) + if: ${{ always() && steps.test.conclusion && contains(matrix.runner, 'mi300') }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run: | docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" @@ -332,7 +408,11 @@ jobs: aws-region: us-east-1 - name: Upload the benchmark results +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main +======= + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes 
with float/bfloat16/half (#2791)) with: benchmark-results-dir: test/test-reports dry-run: false diff --git a/.github/workflows/_runner-determinator.yml b/.github/workflows/_runner-determinator.yml index 0d674f044ec42..2f3ed8ea446e5 100644 --- a/.github/workflows/_runner-determinator.yml +++ b/.github/workflows/_runner-determinator.yml @@ -59,7 +59,11 @@ jobs: PR_NUMBER: ${{ github.event.pull_request.number }} steps: # - name: Checkout PyTorch +<<<<<<< HEAD # uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + # uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # with: # fetch-depth: 1 # submodules: true diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml index 0fd3cf7f3972e..5d57b739685c9 100644 --- a/.github/workflows/_win-build.yml +++ b/.github/workflows/_win-build.yml @@ -77,15 +77,26 @@ jobs: run: | git config --global core.longpaths true git config --global core.symlinks true +<<<<<<< HEAD git config --global core.ignorecase false +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock # the directory on Windows and prevent GHA from checking out as reported # in https://github.com/actions/checkout/issues/1018 git config --global core.fsmonitor false +<<<<<<< HEAD - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + - name: Clean up leftover processes on non-ephemeral Windows runner + uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.8 + + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -100,7 +111,11 @@ jobs: # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true @@ -148,7 +163,11 @@ jobs: BUILD_WHEEL: 1 MAX_JOBS: 8 CUDA_VERSION: ${{ inputs.cuda-version }} +<<<<<<< HEAD PYTHON_VERSION: "3.10" +======= + PYTHON_VERSION: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) SCCACHE_BUCKET: "ossci-compiler-cache" SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} SCCACHE_REGION: us-east-1 @@ -168,6 +187,7 @@ jobs: run: | .ci/pytorch/win-build.sh +<<<<<<< HEAD # Collect Windows torch libs and CUDA libs for cross-compilation - name: Collect Windows CUDA libs for cross-compilation if: steps.build.outcome != 'skipped' && inputs.cuda-version != 'cpu' @@ -193,6 +213,8 @@ jobs: echo "Collected CUDA libs:" ls -lah /c/${{ github.run_id }}/build-results/*.lib +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Upload to github so that people can click and download artifacts - name: Upload artifacts to s3 if: steps.build.outcome != 'skipped' diff --git 
a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml index 3d2fe8a4b3fac..c1bbc5395659c 100644 --- a/.github/workflows/_win-test.yml +++ b/.github/workflows/_win-test.yml @@ -70,15 +70,26 @@ jobs: run: | git config --global core.longpaths true git config --global core.symlinks true +<<<<<<< HEAD git config --global core.ignorecase false +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock # the directory on Windows and prevent GHA from checking out as reported # in https://github.com/actions/checkout/issues/1018 git config --global core.fsmonitor false +<<<<<<< HEAD - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + - name: Clean up leftover processes on non-ephemeral Windows runner + uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.8 + + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -94,7 +105,11 @@ jobs: # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true @@ -103,6 +118,21 @@ jobs: with: cuda-version: ${{ inputs.cuda-version }} +<<<<<<< HEAD +======= + # TODO: Move to a requirements.txt file for windows + - name: Install pip dependencies + uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 + with: + shell: bash + timeout_minutes: 5 + max_attempts: 5 + retry_wait_seconds: 30 + command: | + set -eu + python3 -m pip install 'xdoctest>=1.1.0' + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Get workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id @@ -124,7 +154,11 @@ jobs: continue-on-error: true run: | # Windows conda doesn't have python3 binary, only python, but it's python3 +<<<<<<< HEAD ${CONDA_RUN} python -m pip install psutil==5.9.8 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 +======= + ${CONDA_RUN} python -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ${CONDA_RUN} python -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" @@ -169,7 +203,11 @@ jobs: env: USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }} INSTALL_WINDOWS_SDK: 1 +<<<<<<< HEAD PYTHON_VERSION: "3.10" +======= + PYTHON_VERSION: 3.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }} @@ -257,6 +295,18 @@ jobs: 
shell: bash run: python3 .github/scripts/parse_ref.py +<<<<<<< HEAD +======= + - name: Uninstall PyTorch + if: always() + continue-on-error: true + shell: bash + run: | + # This step removes PyTorch installed by the test to give a clean slate + # to the next job + python3 -mpip uninstall -y torch + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown Windows uses: ./.github/actions/teardown-win if: always() diff --git a/.github/workflows/_xpu-test.yml b/.github/workflows/_xpu-test.yml index 7aa7608924487..f8dcf6d2a03e6 100644 --- a/.github/workflows/_xpu-test.yml +++ b/.github/workflows/_xpu-test.yml @@ -77,7 +77,11 @@ jobs: steps: # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup XPU uses: ./.github/actions/setup-xpu @@ -95,7 +99,11 @@ jobs: - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ${{ inputs.docker-image }} @@ -109,7 +117,11 @@ jobs: echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -133,7 +145,11 @@ jobs: MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} run: | +<<<<<<< HEAD python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 +======= + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" @@ -191,6 +207,12 @@ jobs: SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} +<<<<<<< HEAD +======= + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + SCCACHE_REGION: us-east-1 + SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DOCKER_IMAGE: ${{ inputs.docker-image }} XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} @@ -275,7 +297,11 @@ jobs: - name: Change permissions if: ${{ always() && steps.test.conclusion }} run: | +<<<<<<< HEAD docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1000:1000 test" 
+======= + docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Print remaining test logs shell: bash diff --git a/.github/workflows/build-almalinux-images.yml b/.github/workflows/build-almalinux-images.yml index 8318286cccbee..a4d5c04274052 100644 --- a/.github/workflows/build-almalinux-images.yml +++ b/.github/workflows/build-almalinux-images.yml @@ -36,10 +36,17 @@ jobs: runs-on: linux.9xlarge.ephemeral strategy: matrix: +<<<<<<< HEAD tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "cpu"] steps: - name: Build docker image uses: pytorch/pytorch/.github/actions/binary-docker-build@main +======= + tag: ["cuda12.6", "cuda12.8", "cuda12.9", "rocm6.3", "rocm6.4", "cpu"] + steps: + - name: Build docker image + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: almalinux-builder custom-tag-prefix: ${{matrix.tag}} diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml index c67281e0a112b..70d7c80609f6b 100644 --- a/.github/workflows/build-libtorch-images.yml +++ b/.github/workflows/build-libtorch-images.yml @@ -32,7 +32,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -48,17 +52,29 @@ jobs: fail-fast: false matrix: include: [ +<<<<<<< HEAD { tag: "cuda13.0" }, { tag: "cuda12.9" }, { tag: "cuda12.8" }, { tag: "cuda12.6" }, { tag: "rocm6.4" }, { tag: "rocm7.0" }, +======= + { tag: "cuda12.9" }, + { tag: "cuda12.8" }, + { tag: "cuda12.6" }, + { tag: "rocm6.3" }, + { tag: "rocm6.4" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { tag: "cpu" }, ] steps: - name: Build docker image +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/binary-docker-build@main +======= + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: libtorch-cxx11-builder custom-tag-prefix: ${{ matrix.tag }} diff --git a/.github/workflows/build-magma-linux.yml b/.github/workflows/build-magma-linux.yml index be8f613169e8c..d96f1505826ce 100644 --- a/.github/workflows/build-magma-linux.yml +++ b/.github/workflows/build-magma-linux.yml @@ -34,7 +34,11 @@ jobs: id-token: write strategy: matrix: +<<<<<<< HEAD cuda_version: ["130", "129", "128", "126"] +======= + cuda_version: ["129", "128", "126"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Checkout PyTorch uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/build-magma-rocm-linux.yml 
b/.github/workflows/build-magma-rocm-linux.yml index eaeb741e56394..d7a6aadd7e380 100644 --- a/.github/workflows/build-magma-rocm-linux.yml +++ b/.github/workflows/build-magma-rocm-linux.yml @@ -34,7 +34,11 @@ jobs: id-token: write strategy: matrix: +<<<<<<< HEAD rocm_version: ["70", "64"] +======= + rocm_version: ["64", "63"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Checkout PyTorch uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/build-magma-windows.yml b/.github/workflows/build-magma-windows.yml index b7d293a5cec11..fc5ea76151cd5 100644 --- a/.github/workflows/build-magma-windows.yml +++ b/.github/workflows/build-magma-windows.yml @@ -22,7 +22,11 @@ jobs: runs-on: windows-2022 strategy: matrix: +<<<<<<< HEAD cuda_version: ["130", "129", "128", "126"] +======= + cuda_version: ["129", "128", "126"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) config: ["Release", "Debug"] env: CUDA_VERSION: ${{ matrix.cuda_version }} diff --git a/.github/workflows/build-manywheel-images-s390x.yml b/.github/workflows/build-manywheel-images-s390x.yml index c498e169f1aa5..b7fbb55df6fe4 100644 --- a/.github/workflows/build-manywheel-images-s390x.yml +++ b/.github/workflows/build-manywheel-images-s390x.yml @@ -25,7 +25,11 @@ jobs: runs-on: linux.s390x steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false no-sudo: true diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index a5c5c387adb82..f9ba2a402657f 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -32,7 +32,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -46,6 +50,7 @@ jobs: fail-fast: false matrix: include: [ +<<<<<<< HEAD { name: "manylinux2_28-builder", tag: "cuda13.0", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cuda12.8", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cuda12.9", runner: "linux.9xlarge.ephemeral" }, @@ -58,13 +63,29 @@ jobs: { name: "manylinux2_28-builder", tag: "rocm7.0", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" }, +======= + { name: "manylinux2_28-builder", tag: "cuda12.9", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "cuda12.8", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "cuda12.6", runner: "linux.9xlarge.ephemeral" }, + { name: 
"manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" }, + { name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" }, + { name: "manylinuxcxx11-abi-builder", tag: "cpu-cxx11-abi", runner: "linux.9xlarge.ephemeral" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { name: "manylinux2_28-builder", tag: "xpu", runner: "linux.9xlarge.ephemeral" }, ] runs-on: ${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }} name: ${{ matrix.name }}:${{ matrix.tag }} steps: - name: Build docker image +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/binary-docker-build@main +======= + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ${{ matrix.name }} custom-tag-prefix: ${{ matrix.tag }} diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml index 9e4144ae56c2d..f350ba334045d 100644 --- a/.github/workflows/build-triton-wheel.yml +++ b/.github/workflows/build-triton-wheel.yml @@ -3,12 +3,19 @@ name: Build Triton wheels on: push: branches: +<<<<<<< HEAD - main +======= + - release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tags: # NOTE: Binary build pipelines should only get triggered on release candidate builds # Release candidate tags look like: v1.11.0-rc1 - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ +<<<<<<< HEAD - 'ciflow/triton_binaries/*' +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) paths: - .github/workflows/build-triton-wheel.yml - .github/scripts/build_triton_wheel.py @@ -36,7 +43,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -50,12 +61,20 @@ jobs: strategy: fail-fast: false matrix: +<<<<<<< HEAD py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] +======= + py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) device: ["cuda", "rocm", "xpu", "aarch64"] docker-image: ["pytorch/manylinux2_28-builder:cpu"] include: - device: "rocm" +<<<<<<< HEAD rocm_version: "7.0" +======= + rocm_version: "6.4" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runs_on: "${{ needs.get-label-type.outputs.label-type 
}}linux.4xlarge" - device: "cuda" rocm_version: "" @@ -74,12 +93,20 @@ jobs: PLATFORM: 'manylinux_2_28_x86_64' steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false @@ -87,7 +114,11 @@ jobs: uses: ./.github/actions/setup-linux - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ env.DOCKER_IMAGE }} @@ -108,6 +139,12 @@ jobs: # Determine python executable for given version case $PY_VERS in +<<<<<<< HEAD +======= + 3.9) + PYTHON_EXECUTABLE=/opt/python/cp39-cp39/bin/python + ;; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 3.10) PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python ;; @@ -123,12 +160,15 @@ jobs: 3.13t) PYTHON_EXECUTABLE=/opt/python/cp313-cp313t/bin/python ;; +<<<<<<< HEAD 3.14) PYTHON_EXECUTABLE=/opt/python/cp314-cp314/bin/python ;; 3.14t) PYTHON_EXECUTABLE=/opt/python/cp314-cp314t/bin/python ;; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) *) echo "Unsupported python version ${PY_VERS}" exit 1 @@ -142,7 +182,11 @@ jobs: fi docker exec -t "${container_name}" yum install -y zlib-devel zip +<<<<<<< HEAD docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==3.0.1 auditwheel wheel +======= + docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==2.13.1 auditwheel wheel +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set +e docker exec -t "${container_name}" command -v pip has_pip=$? 
@@ -181,7 +225,11 @@ jobs: path: ${{ runner.temp }}/artifacts/wheelhouse/* - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() build-wheel-win: @@ -191,7 +239,11 @@ jobs: strategy: fail-fast: false matrix: +<<<<<<< HEAD py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] +======= + py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) device: ["xpu"] timeout-minutes: 40 env: @@ -214,7 +266,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/check-labels.yml b/.github/workflows/check-labels.yml index 44430522b79d8..679d8028c6e69 100644 --- a/.github/workflows/check-labels.yml +++ b/.github/workflows/check-labels.yml @@ -38,7 +38,11 @@ jobs: runs-on: linux.24_04.4x steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: 1 diff --git a/.github/workflows/check_mergeability_ghstack.yml b/.github/workflows/check_mergeability_ghstack.yml index 569a174665ba8..c94545096896f 100644 --- a/.github/workflows/check_mergeability_ghstack.yml +++ b/.github/workflows/check_mergeability_ghstack.yml @@ -56,7 +56,11 @@ jobs: cache: pip architecture: x64 +<<<<<<< HEAD - run: pip install pyyaml==6.0.2 +======= + - run: pip install pyyaml==6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shell: bash - name: Verify mergeability diff --git a/.github/workflows/cherry-pick.yml b/.github/workflows/cherry-pick.yml index 310857782ea14..3153a0fc07175 100644 --- a/.github/workflows/cherry-pick.yml +++ b/.github/workflows/cherry-pick.yml @@ -26,7 +26,11 @@ jobs: cache: pip # Not the direct dependencies but the script uses trymerge +<<<<<<< HEAD - run: pip install pyyaml==6.0.2 +======= + - run: pip install pyyaml==6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup committer id run: | diff --git a/.github/workflows/close-nonexistent-disable-issues.yml b/.github/workflows/close-nonexistent-disable-issues.yml index bef3d8797149c..d20a1407f39a5 100644 --- a/.github/workflows/close-nonexistent-disable-issues.yml +++ b/.github/workflows/close-nonexistent-disable-issues.yml @@ -13,7 +13,11 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: 
pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: 1 diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index d5e0d96fe19f2..c7f79fd9f2be9 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -19,7 +19,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -35,7 +39,10 @@ jobs: contents: write outputs: pt_release_name: ${{ steps.release_name.outputs.pt_release_name }} +<<<<<<< HEAD pt_pep517_release_name: ${{ steps.release_name.outputs.pt_pep517_release_name }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: @@ -54,6 +61,7 @@ jobs: tag_or_branch="${tag_or_branch#refs/heads/}" # replace directory separators with _ in branch name tag_or_branch="${tag_or_branch//\//_}" +<<<<<<< HEAD torch_version="$(python -c 'from tools.generate_torch_version import get_torch_version; print(get_torch_version())')" { echo "PT_RELEASE_NAME=pytorch-$tag_or_branch"; @@ -84,19 +92,43 @@ jobs: pip install build==1.2.2.post1 || exit 1 python -m build --sdist || exit 1 cd dist || exit 1 +======= + echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV" + echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV" + - name: Checkout optional submodules + run: python3 tools/optional_submodules.py + - name: Create source distribution + run: | + # Create new folder with specified name so extracting the archive yields that + rm -rf "/tmp/$PT_RELEASE_NAME" + cp -r "$PWD" "/tmp/$PT_RELEASE_NAME" + mv "/tmp/$PT_RELEASE_NAME" . 
+ # Cleanup + rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci} + find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true + # Create archive + tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME" + echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Upload source distribution for release if: ${{ github.event_name == 'release' }} uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2 with: +<<<<<<< HEAD files: | ${{ env.PT_RELEASE_FILE }} ${{ env.PT_PEP517_RELEASE_FILE }} - name: Upload source distribution to GHA artifacts # for release tags +======= + files: ${{env.PT_RELEASE_FILE}} + - name: Upload source distribution to GHA artifacts for release tags +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: name: ${{ env.PT_RELEASE_FILE }} path: ${{ env.PT_RELEASE_FILE }} +<<<<<<< HEAD - name: Upload PEP 517 source distribution to GHA artifacts # for release tags if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 @@ -110,6 +142,11 @@ jobs: echo "pt_release_name=${{ env.PT_RELEASE_FILE }}"; echo "pt_pep517_release_name=${{ env.PT_PEP517_RELEASE_FILE }}"; } >> "${GITHUB_OUTPUT}" +======= + - name: Set output + id: release_name + run: echo "pt_release_name=${{ env.PT_RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) upload_source_code_to_s3: if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} @@ -125,9 +162,12 @@ jobs: - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 with: name: ${{ needs.release.outputs.pt_release_name }} +<<<<<<< HEAD - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 with: name: ${{ needs.release.outputs.pt_pep517_release_name }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Configure AWS credentials(PyTorch account) uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 with: @@ -138,9 +178,13 @@ jobs: s3-bucket: pytorch s3-prefix: source_code/test if-no-files-found: warn +<<<<<<< HEAD path: | ${{ needs.release.outputs.pt_release_name }} ${{ needs.release.outputs.pt_pep517_release_name }} +======= + path: ${{ needs.release.outputs.pt_release_name }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }} diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index ca257ee8225ad..bb533842d3717 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -33,7 +33,11 @@ jobs: get-label-type: if: 
github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -50,6 +54,7 @@ jobs: runner: [linux.12xlarge] docker-image-name: [ pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11, +<<<<<<< HEAD pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11, pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm, pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks, @@ -73,6 +78,34 @@ jobs: pytorch-linux-jammy-py3-clang12-executorch, pytorch-linux-jammy-py3.12-triton-cpu, pytorch-linux-noble-riscv64-py3.12-gcc14 +======= + pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9, + pytorch-linux-jammy-py3.9-clang12, + pytorch-linux-jammy-py3.11-clang12, + pytorch-linux-jammy-py3.12-clang12, + pytorch-linux-jammy-py3.13-clang12, + pytorch-linux-jammy-rocm-n-1-py3, + pytorch-linux-jammy-rocm-n-py3, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12, + pytorch-linux-jammy-py3.9-gcc11, + pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks, + pytorch-linux-jammy-py3.12-halide, + pytorch-linux-jammy-xpu-2025.0-py3, + pytorch-linux-jammy-xpu-2025.1-py3, + pytorch-linux-jammy-py3-clang15-asan, + pytorch-linux-jammy-py3-clang18-asan, + pytorch-linux-jammy-py3-clang12-onnx, + pytorch-linux-jammy-linter, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter, + pytorch-linux-jammy-py3-clang12-executorch, + pytorch-linux-jammy-py3.12-triton-cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] include: - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11 @@ -94,21 +127,33 @@ jobs: # [see note: pytorch repo ref] # deep clone (fetch-depth 0) required for git merge-base - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup Linux uses: ./.github/actions/setup-linux - name: Build docker image id: build-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ci-image:${{ matrix.docker-image-name }} always-rebuild: true push: true - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.build-docker-image.outputs.docker-image }} @@ -121,7 +166,11 @@ jobs: GHCR_PAT: ${{ secrets.GHCR_PAT }} with: shell: bash +<<<<<<< HEAD timeout_minutes: 60 +======= + timeout_minutes: 30 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) max_attempts: 5 retry_wait_seconds: 90 command: | @@ -139,5 +188,9 @@ jobs: if: always() - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() diff --git a/.github/workflows/docker-cache-mi300.yml b/.github/workflows/docker-cache-mi300.yml index 02c1171c567aa..2f5e2e5e60ca0 100644 --- a/.github/workflows/docker-cache-mi300.yml +++ b/.github/workflows/docker-cache-mi300.yml @@ -20,7 +20,11 @@ jobs: runs-on: rocm-docker steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true @@ -39,13 +43,21 @@ jobs: - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 push: false - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml index 2560ebf7912aa..becdca64080b0 100644 --- a/.github/workflows/docker-release.yml +++ b/.github/workflows/docker-release.yml @@ -37,7 +37,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -52,7 +56,11 @@ jobs: matrix: ${{ steps.generate-matrix.outputs.matrix }} steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: true @@ -82,7 +90,11 @@ jobs: CUDNN_VERSION: ${{ matrix.cudnn_version }} steps: - 
name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} # [see note: pytorch repo ref] @@ -144,7 +156,11 @@ jobs: run: | make -f docker.Makefile "${BUILD_IMAGE_TYPE}-image" - name: Push nightly tags +<<<<<<< HEAD if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' && matrix.platform == 'linux/amd64' }} +======= + if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' && matrix.build_platforms == 'linux/amd64' }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run: | PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-cuda${CUDA_VERSION_SHORT}-cudnn${CUDNN_VERSION}-runtime" CUDA_SUFFIX="-cu${CUDA_VERSION}" @@ -164,12 +180,22 @@ jobs: fi - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() validate: needs: build +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/validate-docker-images.yml@main with: channel: nightly +======= + uses: pytorch/test-infra/.github/workflows/validate-docker-images.yml@release/2.8 + with: + channel: test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ref: main diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml index b8a6403faffbd..de5fa61edafe6 100644 --- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml @@ -41,12 +41,135 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD +======= + manywheel-py3_9-cpu-aarch64-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_9-cpu-aarch64 + build_environment: linux-aarch64-binary-manywheel + 
secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cpu-aarch64-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cpu-aarch64-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cpu-aarch64 + build_environment: linux-aarch64-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.2xlarge + ALPINE_IMAGE: "arm64v8/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cpu-aarch64-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-cpu-aarch64-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cpu-aarch64 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_9-cuda-aarch64-12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_9-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda-aarch64-12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-cuda-aarch64-12_9-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda-aarch64-12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_10-cpu-aarch64-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -60,9 +183,16 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.r7g.12xlarge.memory +======= + use_split_build: False + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -83,6 +213,10 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -106,12 +240,17 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_10-cuda-aarch64-12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -204,6 +343,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_10-cuda-aarch64-12_9-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -214,6 +355,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD 
GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder @@ -225,6 +367,20 @@ jobs: build_name: manywheel-py3_10-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_10-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -240,16 +396,25 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + 
DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda-aarch64-12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_10-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -296,6 +461,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_11-cpu-aarch64-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -309,9 +476,16 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.r7g.12xlarge.memory +======= + use_split_build: False + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -332,6 +506,10 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -355,12 +533,17 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_11-cuda-aarch64-12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -453,6 +636,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_11-cuda-aarch64-12_9-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -463,6 +648,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder @@ -474,6 +660,20 @@ jobs: build_name: manywheel-py3_11-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; 
platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_11-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -489,16 +689,25 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda-aarch64-12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml 
+<<<<<<< HEAD manywheel-py3_11-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -545,6 +754,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_12-cpu-aarch64-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -558,9 +769,16 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.r7g.12xlarge.memory +======= + use_split_build: False + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -581,6 +799,10 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -604,12 +826,17 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_12-cuda-aarch64-12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -702,6 +929,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_12-cuda-aarch64-12_9-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -712,6 +941,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder @@ -723,6 +953,20 @@ jobs: build_name: manywheel-py3_12-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; 
platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_12-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -738,16 +982,25 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda-aarch64-12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_12-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -794,6 +1047,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed 
dtypes with float/bfloat16/half (#2791)) manywheel-py3_13-cpu-aarch64-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -807,9 +1062,16 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.r7g.12xlarge.memory +======= + use_split_build: False + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -830,6 +1092,10 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -853,12 +1119,17 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13-cuda-aarch64-12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -951,6 +1222,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_13-cuda-aarch64-12_9-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -961,6 +1234,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder @@ -972,6 +1246,20 @@ jobs: build_name: manywheel-py3_13-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; 
platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -987,16 +1275,25 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda-aarch64-12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1043,6 +1340,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_13t-cpu-aarch64-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1056,9 +1355,16 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD DESIRED_PYTHON: "3.13t" 
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.r7g.12xlarge.memory +======= + use_split_build: False + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -1079,6 +1385,10 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -1102,12 +1412,17 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13t-cuda-aarch64-12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1200,6 +1515,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_13t-cuda-aarch64-12_9-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1210,6 +1527,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder @@ -1221,6 +1539,20 @@ jobs: build_name: manywheel-py3_13t-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13t-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1236,15 +1568,24 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda-aarch64-12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13t-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} @@ -1789,3 +2130,5 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml index 7f3277ef64a12..b9528cb738a83 100644 --- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml +++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml @@ -41,7 +41,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -122,7 +126,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -145,7 +153,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -154,7 +166,11 @@ jobs: build_name: libtorch-cuda12_6-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda12_6-shared-with-deps-release-upload: # Uploading @@ -169,7 +185,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -190,7 +210,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -213,7 +237,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -222,7 +250,11 @@ jobs: build_name: libtorch-cuda12_8-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ 
secrets.GITHUB_TOKEN }} libtorch-cuda12_8-shared-with-deps-release-upload: # Uploading @@ -237,7 +269,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -258,7 +294,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" +======= + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -281,7 +321,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" +======= + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -290,7 +334,11 @@ jobs: build_name: libtorch-cuda12_9-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading @@ -305,7 +353,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" +======= + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -316,7 +368,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD libtorch-cuda13_0-shared-with-deps-release-build: +======= + libtorch-rocm6_3-shared-with-deps-release-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -325,6 +381,7 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda @@ -410,6 +467,24 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: - libtorch-rocm6_4-shared-with-deps-release-build +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" + build_name: libtorch-rocm6_3-shared-with-deps-release + build_environment: linux-binary-libtorch + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-rocm6_3-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-rocm6_3-shared-with-deps-release-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -418,6 +493,7 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.4 GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm @@ -429,18 +505,35 @@ jobs: permissions: id-token: write contents: read +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: libtorch-rocm6_4-shared-with-deps-release +======= + name: libtorch-rocm6_3-shared-with-deps-release +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -462,7 +555,124 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: libtorch-cxx11-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + libtorch-rocm6_3-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-rocm6_3-shared-with-deps-release-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + LIBTORCH_CONFIG: release 
+ LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-rocm6_3-shared-with-deps-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + libtorch-rocm6_4-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: libtorch-rocm6_4-shared-with-deps-release + build_environment: linux-binary-libtorch + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-rocm6_4-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-rocm6_4-shared-with-deps-release-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: libtorch-rocm6_4-shared-with-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: libtorch-cxx11-builder @@ -470,7 +680,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -491,7 
+705,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 +<<<<<<< HEAD GPU_ARCH_VERSION: "6.4" +======= + GPU_ARCH_VERSION: 6.4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -501,6 +719,7 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD libtorch-rocm7_0-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} @@ -619,3 +838,5 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/generated-linux-binary-libtorch-release-main.yml b/.github/workflows/generated-linux-binary-libtorch-release-main.yml new file mode 100644 index 0000000000000..1b231ca5ffb6f --- /dev/null +++ b/.github/workflows/generated-linux-binary-libtorch-release-main.yml @@ -0,0 +1,87 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-libtorch-release + + +on: + push: + branches: + - main + tags: + - 'ciflow/trunk/*' + workflow_dispatch: + +permissions: + id-token: write + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-libtorch-release + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 0 +concurrency: + group: linux-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + libtorch-cpu-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: libtorch-cpu-shared-with-deps-release + build_environment: linux-binary-libtorch-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cpu-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - 
libtorch-cpu-shared-with-deps-release-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cpu-shared-with-deps-release + build_environment: linux-binary-libtorch-release + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-linux-binary-manywheel-main.yml b/.github/workflows/generated-linux-binary-manywheel-main.yml new file mode 100644 index 0000000000000..9ad095ee68b5f --- /dev/null +++ b/.github/workflows/generated-linux-binary-manywheel-main.yml @@ -0,0 +1,275 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-manywheel + + +on: + push: + branches: + - main + tags: + - 'ciflow/trunk/*' + workflow_dispatch: + +permissions: + id-token: write + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-manywheel + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 0 +concurrency: + group: linux-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + manywheel-py3_9-cuda12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-cuda12_6 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cuda12_6-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_6 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + + manywheel-py3_9-cuda12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-cuda12_8 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_8-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cuda12_8-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_8 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + + manywheel-py3_9-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we 
eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + + manywheel-py3_9-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_9-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ 
steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index 5fcf4e0bd176f..48d92b918689a 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -41,12 +41,629 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD +======= + manywheel-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-cpu + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cpu-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cpu + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-cpu-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_9-cuda12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + 
GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-cuda12_6 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cuda12_6-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_6 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-cuda12_6-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_9-cuda12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + 
DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-cuda12_8 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_8-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cuda12_8-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_8 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-cuda12_8-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_9-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to 
get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_9-rocm6_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a 
legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-rocm6_3 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-rocm6_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-rocm6_3-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.9" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_9-rocm6_3 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_9-rocm6_3-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-rocm6_3-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-rocm6_3 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + 
manywheel-py3_9-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_9-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_9-rocm6_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-rocm6_4-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: 
manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-rocm6_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_9-xpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-xpu + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-xpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-xpu-build + - get-label-type + runs-on: linux.idc.xpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu + use_split_build: False + DESIRED_PYTHON: "3.9" + permissions: + id-token: write + contents: read + steps: + - name: Setup XPU + uses: ./.github/actions/setup-xpu + - name: configure aws credentials + id: aws_creds + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@v2 + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_9-xpu + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: xpu + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: 
${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown XPU + uses: ./.github/actions/teardown-xpu + manywheel-py3_9-xpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-xpu-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-xpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_10-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -60,6 +677,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cpu @@ -81,6 +702,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu build_environment: linux-binary-manywheel @@ -103,6 +728,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu secrets: @@ -119,15 +748,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; 
platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_6-test: # Testing @@ -142,15 +783,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_6-upload: # Uploading @@ -165,10 +818,18 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + 
use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_6 secrets: @@ -185,15 +846,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_8-test: # Testing @@ -208,15 +881,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get 
rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_8-upload: # Uploading @@ -231,10 +916,18 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_8 secrets: @@ -251,15 +944,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_9 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_9-test: # Testing @@ -274,15 +979,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_9-upload: # Uploading @@ -297,17 +1014,29 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_10-cuda13_0-build: +======= + manywheel-py3_10-rocm6_3-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ 
-316,6 +1045,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda @@ -356,19 +1086,174 @@ jobs: permissions: id-token: write contents: read - needs: manywheel-py3_10-cuda13_0-test + needs: manywheel-py3_10-cuda13_0-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_10-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 + build_name: manywheel-py3_10-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_10-rocm6_4-build +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_10-rocm6_3 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-rocm6_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_10-rocm6_3-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION +<<<<<<< HEAD + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DESIRED_PYTHON: "3.10" + permissions: + id-token: write + contents: read +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.10" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: +<<<<<<< HEAD + name: manywheel-py3_10-rocm6_4 +======= + name: manywheel-py3_10-rocm6_3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
for mixed dtypes with float/bfloat16/half (#2791)) + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: +<<<<<<< HEAD + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_10-rocm6_3-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-rocm6_3-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu130 - GPU_ARCH_VERSION: "13.0" - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda13_0 + build_name: manywheel-py3_10-rocm6_3 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -383,13 +1268,13 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + GPU_ARCH_VERSION: 6.4 GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 build_name: manywheel-py3_10-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -407,15 +1292,13 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + GPU_ARCH_VERSION: 6.4 
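The ROCm test jobs above export GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon into GITHUB_ENV before the test-pytorch-binary action runs. A rough sketch of how a flag string like that can be spliced into a docker run invocation; the image tag and the rocminfo smoke test below are hypothetical, and the real action's internals are not shown in this diff:

```python
# Sketch: splice a GPU_FLAG-style string of device options into `docker run`.
# Illustration of the flags set in the workflow above, not the CI's own code.
import shlex
import subprocess

gpu_flag = (
    "--device=/dev/mem --device=/dev/kfd --device=/dev/dri "
    "--group-add video --group-add daemon"
)
image = "manylinux2_28-builder:rocm6.4"  # hypothetical tag, for illustration only

cmd = ["docker", "run", "--rm", *shlex.split(gpu_flag), image, "rocminfo"]
print(" ".join(cmd))                 # inspect the assembled command
# subprocess.run(cmd, check=True)    # uncomment on a host with Docker and ROCm
```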
GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False DESIRED_PYTHON: "3.10" - permissions: - id-token: write - contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm @@ -427,7 +1310,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -449,7 +1331,8 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -457,7 +1340,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -478,16 +1365,25 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 +<<<<<<< HEAD GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 +======= + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_10-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -603,6 +1499,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_10-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -616,11 +1514,19 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; 
platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-xpu-test: # Testing @@ -640,13 +1546,21 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" permissions: id-token: write contents: read steps: - name: Setup XPU +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/setup-xpu@main +======= + uses: ./.github/actions/setup-xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -664,7 +1578,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -675,7 +1592,11 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -683,7 +1604,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ 
steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -707,6 +1632,10 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-xpu secrets: @@ -726,6 +1655,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cpu @@ -747,6 +1680,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu build_environment: linux-binary-manywheel @@ -769,6 +1706,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu secrets: @@ -785,15 +1726,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_6-test: # Testing @@ -808,15 +1761,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_6-upload: # Uploading @@ -831,10 +1796,18 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_6 secrets: @@ -851,15 +1824,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for 
mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_8-test: # Testing @@ -874,15 +1859,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + 
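One recurring difference between the two sides of these conflicts is that HEAD quotes GPU_ARCH_VERSION ("12.8", "6.4") while the incoming side leaves it bare (12.8, 6.4). In plain YAML a bare numeric-looking scalar loads as a float, which can silently reshape a version string; that is a plausible reason for the quoting, though how the reusable workflows consume the value is not visible in this diff. A quick illustration with PyYAML (assumed available):

```python
# Sketch: quoted vs. bare version-like scalars in YAML. Bare scalars that look
# numeric are parsed as floats, so "12.10" and 12.10 are not the same value.
import yaml

print(yaml.safe_load('v: 12.8'))      # {'v': 12.8}    -> float
print(yaml.safe_load('v: "12.8"'))    # {'v': '12.8'}  -> string
print(yaml.safe_load('v: 12.10'))     # {'v': 12.1}    -> trailing zero lost
print(yaml.safe_load('v: "12.10"'))   # {'v': '12.10'} -> preserved
```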
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_8-upload: # Uploading @@ -897,16 +1894,95 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD +======= + manywheel-py3_11-cuda12_8-full-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_11-cuda12_8-full + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_8-full-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_11-cuda12_8-full-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_8-full + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_8-full-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda12_8-full-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_8-full + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_11-cuda12_9-build: if: ${{ github.repository_owner == 
'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -917,15 +1993,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_9 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_9-test: # Testing @@ -940,15 +2028,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder 
DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_9-upload: # Uploading @@ -963,17 +2063,29 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_11-cuda13_0-build: +======= + manywheel-py3_11-rocm6_3-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -982,6 +2094,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda @@ -1064,6 +2177,24 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: - manywheel-py3_11-rocm6_4-build +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_11-rocm6_3 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-rocm6_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_11-rocm6_3-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -1072,6 +2203,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.4 GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm @@ -1082,18 +2214,35 @@ jobs: permissions: id-token: write contents: read +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: 
"3.11" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: manywheel-py3_11-rocm6_4 +======= + name: manywheel-py3_11-rocm6_3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1115,7 +2264,124 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_11-rocm6_3-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-rocm6_3-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-rocm6_3 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_11-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_11-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_11-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # 
TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.11" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_11-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1123,7 +2389,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1144,16 +2414,25 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 +<<<<<<< HEAD GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 +======= + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_11-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1269,6 +2548,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_11-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1282,11 +2563,19 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: 
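The jobs above repeat the same build, test, and upload triplet for every Python/accelerator combination, and their names follow a regular pattern (manywheel-py3_11-rocm6_4, manywheel-py3_10-cuda12_8, manywheel-py3_12-cpu, ...). A hypothetical helper that reproduces the naming, purely to make the pattern explicit; the real workflow is presumably generated elsewhere and this function is not part of it:

```python
# Sketch: derive the build_name used by the build/test/upload job triplets above.
# Hypothetical helper for illustration; not part of the workflow or its generator.
def build_name(package: str, python: str, arch_type: str, arch_version: str = "") -> str:
    py_tag = "py" + python.replace(".", "_")
    if arch_type == "cpu":
        arch_tag = "cpu"
    else:
        arch_tag = arch_type + arch_version.replace(".", "_")
    return f"{package}-{py_tag}-{arch_tag}"


assert build_name("manywheel", "3.11", "rocm", "6.4") == "manywheel-py3_11-rocm6_4"
assert build_name("manywheel", "3.10", "cuda", "12.8") == "manywheel-py3_10-cuda12_8"
assert build_name("manywheel", "3.12", "cpu") == "manywheel-py3_12-cpu"
```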
manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-xpu-test: # Testing @@ -1306,13 +2595,21 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" permissions: id-token: write contents: read steps: - name: Setup XPU +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/setup-xpu@main +======= + uses: ./.github/actions/setup-xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -1330,7 +2627,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1341,7 +2641,11 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1349,7 +2653,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1373,6 +2681,10 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-xpu secrets: @@ -1392,6 +2704,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cpu @@ -1413,6 +2729,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu build_environment: linux-binary-manywheel @@ -1435,6 +2755,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu secrets: @@ -1451,15 +2775,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | 
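The calculate-docker-image steps above pick the registry with the GitHub Actions idiom cond && a || b, which behaves like a ternary as long as the first branch is truthy: refs under refs/tags/ciflow/ read from the internal ECR registry, everything else falls back to docker.io. A Python rendering of the same decision, with a made-up ref in the example:

```python
# Sketch: Python equivalent of the docker-registry expression used above.
# The `cond && a || b` idiom acts as a ternary because both registry strings
# are non-empty (a falsy first branch would otherwise fall through to b).
ECR = "308535385114.dkr.ecr.us-east-1.amazonaws.com"


def docker_registry(ref: str) -> str:
    # ciflow tag pushes pull from ECR; everything else uses Docker Hub
    return ECR if ref.startswith("refs/tags/ciflow/") else "docker.io"


assert docker_registry("refs/tags/ciflow/example/12345") == ECR  # hypothetical ref
assert docker_registry("refs/heads/main") == "docker.io"
```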
nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_6-test: # Testing @@ -1474,15 +2810,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_6-upload: # Uploading @@ -1497,10 +2845,18 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_6 secrets: @@ -1517,15 +2873,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-test: # Testing @@ -1540,15 +2908,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD 
GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-upload: # Uploading @@ -1563,10 +2943,18 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_8 secrets: @@ -1583,15 +2971,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_9 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_9-test: # Testing @@ -1606,15 +3006,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_9-upload: # Uploading @@ -1629,17 +3041,29 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_12-cuda13_0-build: +======= + manywheel-py3_12-rocm6_3-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -1648,6 +3072,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is 
a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda @@ -1730,6 +3155,24 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: - manywheel-py3_12-rocm6_4-build +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_12-rocm6_3 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-rocm6_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_12-rocm6_3-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -1738,6 +3181,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.4 GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm @@ -1748,18 +3192,35 @@ jobs: permissions: id-token: write contents: read +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.12" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: manywheel-py3_12-rocm6_4 +======= + name: manywheel-py3_12-rocm6_3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1781,7 +3242,124 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_12-rocm6_3-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: 
read + needs: manywheel-py3_12-rocm6_3-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-rocm6_3 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_12-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_12-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_12-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.12" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_12-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1789,7 +3367,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1810,16 +3392,25 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 +<<<<<<< HEAD GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 +======= + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_12-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1935,6 +3526,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_12-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1948,11 +3541,19 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} 
manywheel-py3_12-xpu-test: # Testing @@ -1972,13 +3573,21 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" permissions: id-token: write contents: read steps: - name: Setup XPU +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/setup-xpu@main +======= + uses: ./.github/actions/setup-xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -1996,7 +3605,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2007,7 +3619,11 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -2015,7 +3631,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -2039,6 +3659,10 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-xpu secrets: @@ -2058,6 +3682,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cpu @@ -2079,6 +3707,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu build_environment: linux-binary-manywheel @@ -2101,6 +3733,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu secrets: @@ -2117,15 +3753,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_6-test: # Testing @@ -2140,15 +3788,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of 
GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_6-upload: # Uploading @@ -2163,10 +3823,18 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_6 secrets: @@ -2183,15 +3851,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_8-test: # Testing @@ -2206,15 +3886,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_8-upload: # Uploading @@ -2229,10 +3921,18 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_8 secrets: @@ -2249,15 +3949,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + 
use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_9 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_9-test: # Testing @@ -2272,15 +3984,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_9-upload: # Uploading @@ -2295,78 +4019,246 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13-cuda13_0-build: +======= + manywheel-py3_13-rocm6_3-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION +<<<<<<< HEAD + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13-cuda13_0 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda13_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13-cuda13_0-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda13_0 + build_environment: 
linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13-cuda13_0-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 + build_name: manywheel-py3_13-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu130 - GPU_ARCH_VERSION: "13.0" - GPU_ARCH_TYPE: cuda + needs: + - manywheel-py3_13-rocm6_4-build +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13-cuda13_0 + build_name: manywheel-py3_13-rocm6_3 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda13_0-test: # Testing + manywheel-py3_13-rocm6_3-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - 
manywheel-py3_13-cuda13_0-build + - manywheel-py3_13-rocm6_3-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu130 - GPU_ARCH_VERSION: "13.0" - GPU_ARCH_TYPE: cuda +<<<<<<< HEAD + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda13_0 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda13_0-upload: # Uploading + permissions: + id-token: write + contents: read +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.13" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: +<<<<<<< HEAD + name: manywheel-py3_13-rocm6_4 +======= + name: manywheel-py3_13-rocm6_3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: +<<<<<<< HEAD + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + 
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_13-rocm6_3-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13-cuda13_0-test + needs: manywheel-py3_13-rocm6_3-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu130 - GPU_ARCH_VERSION: "13.0" - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda13_0 + build_name: manywheel-py3_13-rocm6_3 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -2381,13 +4273,13 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + GPU_ARCH_VERSION: 6.4 GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 build_name: manywheel-py3_13-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -2405,15 +4297,13 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + GPU_ARCH_VERSION: 6.4 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False DESIRED_PYTHON: "3.13" - permissions: - id-token: write - contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm @@ -2425,7 +4315,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2447,7 +4336,8 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -2455,7 +4345,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -2476,16 +4370,25 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid 
of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 +<<<<<<< HEAD GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 +======= + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2601,6 +4504,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_13-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2614,11 +4519,19 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-xpu-test: # Testing @@ -2638,13 +4551,21 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: 
"3.13" permissions: id-token: write contents: read steps: - name: Setup XPU +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/setup-xpu@main +======= + uses: ./.github/actions/setup-xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -2662,7 +4583,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2673,7 +4597,11 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -2681,7 +4609,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -2705,6 +4637,10 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-xpu secrets: @@ -2724,6 +4660,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cpu @@ -2745,6 +4685,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu build_environment: linux-binary-manywheel @@ -2767,6 +4711,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu secrets: @@ -2783,15 +4731,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION 
DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_6-test: # Testing @@ -2806,15 +4766,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_6-upload: # Uploading @@ -2829,10 +4801,18 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_6 secrets: @@ -2849,15 +4829,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_8-test: # Testing @@ -2872,15 +4864,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_8-upload: # Uploading @@ -2895,10 +4899,18 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_8 secrets: @@ -2915,15 +4927,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_9 build_environment: linux-binary-manywheel 
+<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_9-test: # Testing @@ -2938,15 +4962,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 
secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_9-upload: # Uploading @@ -2961,17 +4997,29 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13t-cuda13_0-build: +======= + manywheel-py3_13t-rocm6_3-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2980,6 +5028,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda @@ -3062,6 +5111,24 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: - manywheel-py3_13t-rocm6_4-build +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13t-rocm6_3 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-rocm6_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13t-rocm6_3-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -3070,6 +5137,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.4 GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm @@ -3080,18 +5148,35 @@ jobs: permissions: id-token: write contents: read +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.13t" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: manywheel-py3_13t-rocm6_4 +======= + name: manywheel-py3_13t-rocm6_3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast 
kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3113,7 +5198,124 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_13t-rocm6_3-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-rocm6_3-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-rocm6_3 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13t-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13t-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13t-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.13t" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_13t-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git 
clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3121,7 +5323,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3142,16 +5348,25 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 +<<<<<<< HEAD GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 +======= + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13t-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3267,6 +5482,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_13t-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3280,11 +5497,19 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and 
platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-xpu-test: # Testing @@ -3304,13 +5529,21 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" permissions: id-token: write contents: read steps: - name: Setup XPU +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/setup-xpu@main +======= + uses: ./.github/actions/setup-xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -3328,7 +5561,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3339,7 +5575,11 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3347,7 +5587,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3371,11 +5615,16 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: 
manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_14-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} @@ -4708,3 +6957,5 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml index 4a7ebe8366336..2fcb18482eb99 100644 --- a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml @@ -41,12 +41,86 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD +======= + manywheel-py3_9-cpu-s390x-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + use_split_build: False + DESIRED_PYTHON: "3.9" + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + timeout-minutes: 420 + build_name: manywheel-py3_9-cpu-s390x + build_environment: linux-s390x-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cpu-s390x-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cpu-s390x-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cpu-s390x + build_environment: linux-s390x-binary-manywheel + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cpu-s390x-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-cpu-s390x-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to 
get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cpu-s390x + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_10-cpu-s390x-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -60,6 +134,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" @@ -83,6 +161,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu-s390x build_environment: linux-s390x-binary-manywheel @@ -105,6 +187,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu-s390x secrets: @@ -124,6 +210,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" @@ -147,6 +237,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-s390x build_environment: linux-s390x-binary-manywheel @@ -169,6 +263,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-s390x secrets: @@ -188,6 +286,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" @@ -211,6 +313,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + 
use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-s390x build_environment: linux-s390x-binary-manywheel @@ -233,6 +339,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-s390x secrets: @@ -252,6 +362,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" @@ -275,6 +389,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-s390x build_environment: linux-s390x-binary-manywheel @@ -297,11 +415,16 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-s390x secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13t-cpu-s390x-build: if: ${{ github.repository_owner == 'pytorch' }} @@ -494,3 +617,5 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml index 109e98cd9d91f..86ff96b7a2e0c 100644 --- a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml @@ -46,7 +46,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -60,6 +64,7 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Setup Python uses: actions/setup-python@v6 with: @@ -70,6 +75,23 @@ jobs: uses: actions/checkout@v4 with: 
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + if [ -d "/Applications/Xcode_14.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + elif [ -d "/Applications/Xcode_13.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -80,9 +102,19 @@ jobs: working-directory: pytorch - name: Populate binary env run: | +<<<<<<< HEAD + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml index afe9330deb83d..6ac8df02540f4 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml @@ -30,6 +30,132 @@ concurrency: cancel-in-progress: true jobs: +<<<<<<< HEAD +======= + wheel-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-14-xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + if [ -d "/Applications/Xcode_14.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + elif [ -d "/Applications/Xcode_13.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + set -eux -o pipefail + # shellcheck disable=SC1090 + source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" + mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" + + # Build + USE_PYTORCH_METAL_EXPORT=1 + USE_COREML_DELEGATE=1 + TORCH_PACKAGE_NAME="${TORCH_PACKAGE_NAME//-/_}" + export USE_PYTORCH_METAL_EXPORT + export USE_COREML_DELEGATE + export TORCH_PACKAGE_NAME + "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" + - name: Test PyTorch wheel + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + set -eux -o pipefail + # shellcheck disable=SC1090 + source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" + pip uninstall -y "$TORCH_PACKAGE_NAME" || true + pip uninstall -y "$TORCH_PACKAGE_NAME" || true + + # Create new "clean" conda environment for testing + + SMOKE_TEST_PARAMS="" + if [[ $DESIRED_PYTHON == "3.13t" ]]; then + conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge + SMOKE_TEST_PARAMS="--torch-compile-check disabled" + else + conda create -yn "test_conda_env" python="$DESIRED_PYTHON" + fi + conda activate test_conda_env + pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v + + # shellcheck disable=SC2086 + python "${PYTORCH_ROOT}/.ci/pytorch/smoke_test/smoke_test.py" --package torchonly ${SMOKE_TEST_PARAMS} + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_9-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_9-cpu-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: 
manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + DESIRED_PYTHON: "3.9" + build_name: wheel-py3_9-cpu + use_s3: False + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wheel-py3_10-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: macos-14-xlarge @@ -56,6 +182,7 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Setup Python uses: actions/setup-python@v6 with: @@ -66,6 +193,23 @@ jobs: uses: actions/checkout@v4 with: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + if [ -d "/Applications/Xcode_14.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + elif [ -d "/Applications/Xcode_13.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -76,9 +220,19 @@ jobs: working-directory: pytorch - name: Populate binary env run: | +<<<<<<< HEAD "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -94,6 +248,11 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | +<<<<<<< HEAD +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -103,10 +262,20 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" +<<<<<<< HEAD # shellcheck disable=SC2086 python -mvenv test_venv source test_venv/bin/activate +======= + if [[ $DESIRED_PYTHON == "3.13t" ]]; then + conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge + SMOKE_TEST_PARAMS="--torch-compile-check disabled" + else + conda create -yn "test_conda_env" python="$DESIRED_PYTHON" + fi + conda activate test_conda_env +>>>>>>> 5729657180 ([ROCm] Specialized 
binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -165,6 +334,7 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Setup Python uses: actions/setup-python@v6 with: @@ -175,6 +345,23 @@ jobs: uses: actions/checkout@v4 with: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + if [ -d "/Applications/Xcode_14.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + elif [ -d "/Applications/Xcode_13.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -185,9 +372,19 @@ jobs: working-directory: pytorch - name: Populate binary env run: | +<<<<<<< HEAD + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -203,6 +400,11 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | +<<<<<<< HEAD +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -212,10 +414,20 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" +<<<<<<< HEAD # shellcheck disable=SC2086 python -mvenv test_venv source test_venv/bin/activate +======= + if [[ $DESIRED_PYTHON == "3.13t" ]]; then + conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge + SMOKE_TEST_PARAMS="--torch-compile-check disabled" + else + conda create -yn "test_conda_env" python="$DESIRED_PYTHON" + fi + conda activate test_conda_env +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -274,6 +486,7 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" 
# shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Setup Python uses: actions/setup-python@v6 with: @@ -284,6 +497,23 @@ jobs: uses: actions/checkout@v4 with: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + if [ -d "/Applications/Xcode_14.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + elif [ -d "/Applications/Xcode_13.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -294,9 +524,19 @@ jobs: working-directory: pytorch - name: Populate binary env run: | +<<<<<<< HEAD "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -312,6 +552,11 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | +<<<<<<< HEAD +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -321,10 +566,20 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" +<<<<<<< HEAD # shellcheck disable=SC2086 python -mvenv test_venv source test_venv/bin/activate +======= + if [[ $DESIRED_PYTHON == "3.13t" ]]; then + conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge + SMOKE_TEST_PARAMS="--torch-compile-check disabled" + else + conda create -yn "test_conda_env" python="$DESIRED_PYTHON" + fi + conda activate test_conda_env +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -383,6 +638,7 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Setup Python uses: actions/setup-python@v6 with: @@ -393,6 +649,23 @@ jobs: uses: actions/checkout@v4 with: ref: ${{ github.event_name == 'pull_request' && 
github.event.pull_request.head.sha || github.sha }} +======= + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + if [ -d "/Applications/Xcode_14.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + elif [ -d "/Applications/Xcode_13.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -403,9 +676,19 @@ jobs: working-directory: pytorch - name: Populate binary env run: | +<<<<<<< HEAD + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -421,6 +704,11 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | +<<<<<<< HEAD +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -430,10 +718,20 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" +<<<<<<< HEAD # shellcheck disable=SC2086 python -mvenv test_venv source test_venv/bin/activate +======= + if [[ $DESIRED_PYTHON == "3.13t" ]]; then + conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge + SMOKE_TEST_PARAMS="--torch-compile-check disabled" + else + conda create -yn "test_conda_env" python="$DESIRED_PYTHON" + fi + conda activate test_conda_env +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -492,6 +790,7 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Setup Python uses: actions/setup-python@v6 with: @@ -502,6 +801,23 @@ jobs: uses: actions/checkout@v4 with: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o 
"${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + if [ -d "/Applications/Xcode_14.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + elif [ -d "/Applications/Xcode_13.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -512,9 +828,19 @@ jobs: working-directory: pytorch - name: Populate binary env run: | +<<<<<<< HEAD + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -530,6 +856,11 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | +<<<<<<< HEAD +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -539,10 +870,20 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" +<<<<<<< HEAD # shellcheck disable=SC2086 python -mvenv test_venv source test_venv/bin/activate +======= + if [[ $DESIRED_PYTHON == "3.13t" ]]; then + conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge + SMOKE_TEST_PARAMS="--torch-compile-check disabled" + else + conda create -yn "test_conda_env" python="$DESIRED_PYTHON" + fi + conda activate test_conda_env +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -575,6 +916,7 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD wheel-py3_14-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: macos-14-xlarge @@ -793,3 +1135,5 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml index 7c26dbc3b9eea..3f5e0579ea009 100644 --- a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml @@ -41,7 +41,11 @@ jobs: 
get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -51,7 +55,11 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -64,7 +72,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Populate binary env shell: cmd @@ -128,7 +140,11 @@ jobs: - libtorch-cpu-shared-with-deps-debug-build - get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -141,7 +157,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Populate binary env shell: cmd @@ -201,7 +221,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_name: libtorch-cpu-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml index 5e30b66183840..b2e5c084a0213 100644 --- a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml @@ -41,7 +41,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} 
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -51,7 +55,11 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -64,7 +72,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Populate binary env shell: cmd @@ -128,7 +140,11 @@ jobs: - libtorch-cpu-shared-with-deps-release-build - get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -141,7 +157,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Populate binary env shell: cmd @@ -201,7 +221,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_name: libtorch-cpu-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml index 1368bc942350e..8f5c089c1d67c 100644 --- a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml @@ -41,7 +41,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -51,7 +55,11 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: 
wheel @@ -124,7 +132,11 @@ jobs: - wheel-py3_11-cpu-build - get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -198,7 +210,11 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -271,7 +287,11 @@ jobs: - wheel-py3_12-cpu-build - get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -345,7 +365,11 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -418,7 +442,11 @@ jobs: - wheel-py3_13-cpu-build - get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml new file mode 100644 index 0000000000000..33d97946c6dc8 --- /dev/null +++ b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml @@ -0,0 +1,259 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: windows-binary-libtorch-debug + +on: + push: + branches: + - main + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: windows-binary-libtorch-debug + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 + OS: windows +concurrency: + group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + 
libtorch-cpu-shared-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: libtorch-cpu-shared-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + libtorch-cpu-shared-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cpu-shared-with-deps-debug-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml index 3ca3364e5de88..131e97fa2b158 100644 --- a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml @@ -35,7 +35,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -44,8 +48,13 @@ 
jobs: libtorch-cpu-shared-with-deps-debug-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -58,7 +67,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -84,7 +97,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -116,7 +133,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -160,7 +180,11 @@ jobs: - libtorch-cpu-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -173,7 +197,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Display EC2 information shell: bash @@ -190,7 +218,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -222,7 +254,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ 
github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -283,7 +318,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_name: libtorch-cpu-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -291,22 +330,35 @@ jobs: libtorch-cuda12_6-shared-with-deps-debug-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -332,7 +384,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -364,7 +420,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -408,21 +467,33 @@ jobs: - libtorch-cuda12_6-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Display EC2 information shell: bash @@ -439,7 +510,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -471,7 +546,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -527,13 +605,21 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_name: libtorch-cuda12_6-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -541,22 +627,35 @@ jobs: libtorch-cuda12_8-shared-with-deps-debug-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -582,7 +681,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -614,7 +717,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -658,21 +764,33 @@ jobs: - libtorch-cuda12_8-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Display EC2 information shell: bash @@ -689,7 +807,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -721,7 +843,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -777,36 +902,61 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_name: libtorch-cuda12_8-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD libtorch-cuda13_0-shared-with-deps-debug-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + libtorch-cuda12_9-shared-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -832,7 +982,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -864,7 +1018,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed 
dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -884,7 +1041,11 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: +<<<<<<< HEAD name: libtorch-cuda13_0-shared-with-deps-debug +======= + name: libtorch-cuda12_9-shared-with-deps-debug +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -902,6 +1063,7 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD libtorch-cuda13_0-shared-with-deps-debug-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -909,20 +1071,38 @@ jobs: - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 360 +======= + libtorch-cuda12_9-shared-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cuda12_9-shared-with-deps-debug-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Display EC2 information shell: bash @@ -939,7 +1119,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -971,7 +1155,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -992,7 +1179,11 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: libtorch-cuda13_0-shared-with-deps-debug +======= + name: libtorch-cuda12_9-shared-with-deps-debug +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ 
-1015,26 +1206,44 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD libtorch-cuda13_0-shared-with-deps-debug-upload: # Uploading +======= + libtorch-cuda12_9-shared-with-deps-debug-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: libtorch-cuda13_0-shared-with-deps-debug-test +======= + needs: libtorch-cuda12_9-shared-with-deps-debug-test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" build_name: libtorch-cuda13_0-shared-with-deps-debug +======= + DESIRED_PYTHON: "3.9" + build_name: libtorch-cuda12_9-shared-with-deps-debug +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-libtorch-release-main.yml b/.github/workflows/generated-windows-binary-libtorch-release-main.yml new file mode 100644 index 0000000000000..de71e497e7328 --- /dev/null +++ b/.github/workflows/generated-windows-binary-libtorch-release-main.yml @@ -0,0 +1,259 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: windows-binary-libtorch-release + +on: + push: + branches: + - main + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: windows-binary-libtorch-release + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 + OS: windows +concurrency: + group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + libtorch-cpu-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ 
needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: libtorch-cpu-shared-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + libtorch-cpu-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cpu-shared-with-deps-release-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 diff --git a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml index c6d1e2cf3b017..1c93a77015cab 100644 --- a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml @@ -35,7 +35,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -44,8 
+48,13 @@ jobs: libtorch-cpu-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -58,7 +67,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -84,7 +97,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -116,7 +133,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -160,7 +180,11 @@ jobs: - libtorch-cpu-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -173,7 +197,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Display EC2 information shell: bash @@ -190,7 +218,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -222,7 +254,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< 
HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -283,7 +318,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_name: libtorch-cpu-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -291,22 +330,35 @@ jobs: libtorch-cuda12_6-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -332,7 +384,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -364,7 +420,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -408,21 +467,33 @@ jobs: - libtorch-cuda12_6-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Display EC2 information shell: bash @@ -439,7 +510,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -471,7 +546,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -527,13 +605,21 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_name: libtorch-cuda12_6-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -541,22 +627,35 @@ jobs: libtorch-cuda12_8-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -582,7 +681,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -614,7 +717,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -658,21 +764,33 @@ jobs: - libtorch-cuda12_8-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Display EC2 information shell: bash @@ -689,7 +807,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -721,7 +843,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && 
github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -777,36 +902,61 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_name: libtorch-cuda12_8-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD libtorch-cuda13_0-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + libtorch-cuda12_9-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -832,7 +982,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -864,7 +1018,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -884,7 +1041,11 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: +<<<<<<< HEAD name: libtorch-cuda13_0-shared-with-deps-release +======= + name: libtorch-cuda12_9-shared-with-deps-release +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -902,6 +1063,7 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD libtorch-cuda13_0-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -909,20 +1071,38 @@ jobs: - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 360 +======= + libtorch-cuda12_9-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cuda12_9-shared-with-deps-release-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Display EC2 information shell: bash @@ -939,7 +1119,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -971,7 +1155,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -992,7 +1179,11 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: libtorch-cuda13_0-shared-with-deps-release +======= + name: libtorch-cuda12_9-shared-with-deps-release +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -1015,26 +1206,44 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD libtorch-cuda13_0-shared-with-deps-release-upload: # Uploading +======= + libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: libtorch-cuda13_0-shared-with-deps-release-test +======= + needs: libtorch-cuda12_9-shared-with-deps-release-test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" build_name: libtorch-cuda13_0-shared-with-deps-release +======= + DESIRED_PYTHON: "3.9" + build_name: libtorch-cuda12_9-shared-with-deps-release +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-wheel-nightly.yml b/.github/workflows/generated-windows-binary-wheel-nightly.yml index e14cb79c0000e..f085c60d21aeb 100644 --- a/.github/workflows/generated-windows-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml @@ -35,17 +35,1204 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD wheel-py3_10-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + wheel-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are 
put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_9-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_9-cpu-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_9-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_9-cpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.9" + build_name: wheel-py3_9-cpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_9-cuda12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + 
DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_9-cuda12_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_9-cuda12_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_9-cuda12_6-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_9-cuda12_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cuda12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_9-cuda12_6-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.9" + build_name: wheel-py3_9-cuda12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_9-cuda12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to 
get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_9-cuda12_8 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_9-cuda12_8-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_9-cuda12_8-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_9-cuda12_8 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cuda12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_9-cuda12_8-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.9" + build_name: wheel-py3_9-cuda12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_9-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to 
get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_9-cuda12_9 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_9-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_9-cuda12_9-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_9-cuda12_9 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_9-cuda12_9-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.9" + build_name: wheel-py3_9-cuda12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_9-xpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get 
rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_9-xpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_9-xpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_9-xpu-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even if the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_9-xpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-xpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_9-xpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + DESIRED_PYTHON: "3.9" + build_name: wheel-py3_9-xpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE:
wheel @@ -80,7 +1267,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -112,7 +1303,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -156,7 +1350,11 @@ jobs: - wheel-py3_10-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -182,7 +1380,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -214,7 +1416,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -279,15 +1484,24 @@ jobs: wheel-py3_10-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -316,7 +1530,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -348,7 +1566,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -392,14 +1613,22 @@ jobs: - wheel-py3_10-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -419,7 +1648,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -451,7 +1684,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -507,7 +1743,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" build_name: wheel-py3_10-cuda12_6 @@ -517,15 +1757,24 @@ jobs: wheel-py3_10-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 
GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -554,7 +1803,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -586,7 +1839,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -630,14 +1886,22 @@ jobs: - wheel-py3_10-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -657,7 +1921,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -689,7 +1957,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -745,25 +2016,42 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" build_name: wheel-py3_10-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD wheel-py3_10-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + wheel-py3_10-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: 
"${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -792,7 +2080,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -824,7 +2116,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -844,7 +2139,11 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: +<<<<<<< HEAD name: wheel-py3_10-cuda13_0 +======= + name: wheel-py3_10-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -862,6 +2161,7 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_10-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -869,13 +2169,27 @@ jobs: - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 360 +======= + wheel-py3_10-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_10-cuda12_9-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -895,7 +2209,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized 
binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -927,7 +2245,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -948,7 +2269,11 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: wheel-py3_10-cuda13_0 +======= + name: wheel-py3_10-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -971,30 +2296,51 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_10-cuda13_0-upload: # Uploading +======= + wheel-py3_10-cuda12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: wheel-py3_10-cuda13_0-test +======= + needs: wheel-py3_10-cuda12_9-test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" build_name: wheel-py3_10-cuda13_0 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml wheel-py3_10-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1004,7 +2350,11 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | 
umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -1030,7 +2380,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1062,7 +2416,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1106,7 +2463,11 @@ jobs: - wheel-py3_10-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1132,7 +2493,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1164,7 +2529,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1229,8 +2597,13 @@ jobs: wheel-py3_11-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 
+======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1265,7 +2638,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1297,7 +2674,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1341,7 +2721,11 @@ jobs: - wheel-py3_11-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1367,7 +2751,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1399,7 +2787,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1464,15 +2855,24 @@ jobs: wheel-py3_11-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -1501,7 +2901,11 @@ jobs: echo "instance-type: $(get_ec2_metadata 
instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1533,7 +2937,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1577,14 +2984,22 @@ jobs: - wheel-py3_11-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -1604,7 +3019,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1636,7 +3055,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1692,7 +3114,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" build_name: wheel-py3_11-cuda12_6 @@ -1702,15 +3128,24 @@ jobs: wheel-py3_11-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This 
is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -1739,7 +3174,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1771,7 +3210,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1815,14 +3257,22 @@ jobs: - wheel-py3_11-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -1842,7 +3292,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1874,7 +3328,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1930,25 +3387,42 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" build_name: wheel-py3_11-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD 
wheel-py3_11-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + wheel-py3_11-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -1977,7 +3451,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2009,7 +3487,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2029,7 +3510,11 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: +<<<<<<< HEAD name: wheel-py3_11-cuda13_0 +======= + name: wheel-py3_11-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2047,6 +3532,7 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_11-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -2054,13 +3540,27 @@ jobs: - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 360 +======= + wheel-py3_11-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_11-cuda12_9-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -2080,7 +3580,11 @@ jobs: echo 
"instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2112,7 +3616,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2133,7 +3640,11 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: wheel-py3_11-cuda13_0 +======= + name: wheel-py3_11-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -2156,30 +3667,51 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_11-cuda13_0-upload: # Uploading +======= + wheel-py3_11-cuda12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: wheel-py3_11-cuda13_0-test +======= + needs: wheel-py3_11-cuda12_9-test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" build_name: wheel-py3_11-cuda13_0 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml wheel-py3_11-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2189,7 +3721,11 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; 
platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -2215,7 +3751,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2247,7 +3787,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2291,7 +3834,11 @@ jobs: - wheel-py3_11-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2317,7 +3864,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2349,7 +3900,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for 
mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2414,8 +3968,13 @@ jobs: wheel-py3_12-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2450,7 +4009,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2482,7 +4045,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2526,7 +4092,11 @@ jobs: - wheel-py3_12-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2552,7 +4122,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2584,7 +4158,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2649,15 +4226,24 @@ jobs: wheel-py3_12-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 
+<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -2686,7 +4272,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2718,7 +4308,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2762,14 +4355,22 @@ jobs: - wheel-py3_12-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -2789,7 +4390,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2821,7 +4426,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2877,7 +4485,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" build_name: wheel-py3_12-cuda12_6 @@ -2887,15 +4499,24 @@ jobs: wheel-py3_12-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" 
timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -2924,7 +4545,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2956,7 +4581,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3000,14 +4628,22 @@ jobs: - wheel-py3_12-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3027,7 +4663,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3059,7 +4699,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3115,25 +4758,42 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + 
GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" build_name: wheel-py3_12-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD wheel-py3_12-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + wheel-py3_12-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3162,7 +4822,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3194,7 +4858,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3214,7 +4881,11 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: +<<<<<<< HEAD name: wheel-py3_12-cuda13_0 +======= + name: wheel-py3_12-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3232,6 +4903,7 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_12-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -3239,13 +4911,27 @@ jobs: - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 360 +======= + wheel-py3_12-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_12-cuda12_9-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< 
HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3265,7 +4951,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3297,7 +4987,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3318,7 +5011,11 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: wheel-py3_12-cuda13_0 +======= + name: wheel-py3_12-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -3341,30 +5038,51 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_12-cuda13_0-upload: # Uploading +======= + wheel-py3_12-cuda12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: wheel-py3_12-cuda13_0-test +======= + needs: wheel-py3_12-cuda12_9-test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" build_name: wheel-py3_12-cuda13_0 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml wheel-py3_12-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3374,7 +5092,11 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" +<<<<<<< HEAD 
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -3400,7 +5122,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3432,7 +5158,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3476,7 +5205,11 @@ jobs: - wheel-py3_12-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3502,7 +5235,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: 
true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3534,7 +5271,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3599,8 +5339,13 @@ jobs: wheel-py3_13-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3635,7 +5380,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3667,7 +5416,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3711,7 +5463,11 @@ jobs: - wheel-py3_13-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3737,7 +5493,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3769,7 +5529,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3834,15 +5597,24 @@ jobs: wheel-py3_13-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + 
timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -3871,7 +5643,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3903,7 +5679,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3947,14 +5726,22 @@ jobs: - wheel-py3_13-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -3974,7 +5761,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4006,7 +5797,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -4062,7 +5856,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed 
dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13" build_name: wheel-py3_13-cuda12_6 @@ -4072,15 +5870,24 @@ jobs: wheel-py3_13-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -4109,7 +5916,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4141,7 +5952,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -4185,14 +5999,22 @@ jobs: - wheel-py3_13-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -4212,7 +6034,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4244,7 +6070,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -4300,25 +6129,42 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13" build_name: wheel-py3_13-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD wheel-py3_13-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + wheel-py3_13-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -4347,7 +6193,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4379,7 +6229,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -4399,7 +6252,11 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: +<<<<<<< HEAD name: wheel-py3_13-cuda13_0 +======= + name: wheel-py3_13-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4417,6 +6274,7 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_13-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -4424,13 +6282,27 @@ jobs: - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 360 +======= + wheel-py3_13-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_13-cuda12_9-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type 
}}windows.g4dn.xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -4450,7 +6322,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4482,7 +6358,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -4503,7 +6382,11 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: wheel-py3_13-cuda13_0 +======= + name: wheel-py3_13-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -4526,30 +6409,51 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_13-cuda13_0-upload: # Uploading +======= + wheel-py3_13-cuda12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: wheel-py3_13-cuda13_0-test +======= + needs: wheel-py3_13-cuda12_9-test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13" build_name: wheel-py3_13-cuda13_0 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml wheel-py3_13-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type 
}}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4559,7 +6463,11 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -4585,7 +6493,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4617,7 +6529,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -4661,7 +6576,11 @@ jobs: - wheel-py3_13-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4687,7 +6606,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB 
EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4719,7 +6642,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -4784,8 +6710,13 @@ jobs: wheel-py3_13t-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4820,7 +6751,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4852,7 +6787,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -4896,7 +6834,11 @@ jobs: - wheel-py3_13t-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4922,7 +6864,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4954,7 +6900,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch 
show-progress: false @@ -5019,15 +6968,24 @@ jobs: wheel-py3_13t-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -5056,7 +7014,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5088,7 +7050,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -5132,14 +7097,22 @@ jobs: - wheel-py3_13t-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -5159,7 +7132,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5191,7 +7168,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: 
recursive path: pytorch show-progress: false @@ -5247,7 +7227,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13t" build_name: wheel-py3_13t-cuda12_6 @@ -5257,15 +7241,24 @@ jobs: wheel-py3_13t-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -5294,7 +7287,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5326,7 +7323,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -5370,14 +7370,22 @@ jobs: - wheel-py3_13t-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -5397,7 +7405,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5429,7 +7441,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -5485,25 +7500,42 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13t" build_name: wheel-py3_13t-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD wheel-py3_13t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + wheel-py3_13t-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -5532,7 +7564,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5564,7 +7600,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -5584,7 +7623,11 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: +<<<<<<< HEAD name: wheel-py3_13t-cuda13_0 +======= + name: wheel-py3_13t-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5602,6 +7645,7 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_13t-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -5609,13 
+7653,27 @@ jobs: - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 360 +======= + wheel-py3_13t-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_13t-cuda12_9-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -5635,7 +7693,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5667,7 +7729,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -5688,7 +7753,11 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: wheel-py3_13t-cuda13_0 +======= + name: wheel-py3_13t-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -5711,30 +7780,51 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_13t-cuda13_0-upload: # Uploading +======= + wheel-py3_13t-cuda12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: wheel-py3_13t-cuda13_0-test +======= + needs: wheel-py3_13t-cuda12_9-test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13t" build_name: wheel-py3_13t-cuda13_0 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.13t" + build_name: wheel-py3_13t-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 
secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml wheel-py3_13t-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5744,7 +7834,11 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -5770,7 +7864,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5802,7 +7900,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -5846,7 +7947,11 @@ jobs: - wheel-py3_13t-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD 
timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5872,7 +7977,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5904,7 +8013,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -5966,6 +8078,7 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD wheel-py3_14-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type @@ -8336,3 +10449,5 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/h100-distributed.yml b/.github/workflows/h100-distributed.yml index be19b8f961f4d..1c582731776f0 100644 --- a/.github/workflows/h100-distributed.yml +++ b/.github/workflows/h100-distributed.yml @@ -8,23 +8,33 @@ on: push: tags: - ciflow/h100-distributed/* +<<<<<<< HEAD schedule: - cron: 46 8 * * * # about 1:46am PDT +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -37,7 +47,11 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runner: "linux.c7i.12xlarge" +======= + runner: "linux.12xlarge" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 cuda-arch-list: 
'9.0' diff --git a/.github/workflows/inductor-micro-benchmark-x86.yml b/.github/workflows/inductor-micro-benchmark-x86.yml index c6cc075e6b270..ce2b5f9bdec16 100644 --- a/.github/workflows/inductor-micro-benchmark-x86.yml +++ b/.github/workflows/inductor-micro-benchmark-x86.yml @@ -13,6 +13,7 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read @@ -25,6 +26,18 @@ jobs: with: build-environment: linux-jammy-py3.9-gcc11 docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks +======= +permissions: read-all + +jobs: + linux-jammy-cpu-py3_9-gcc11-inductor-build: + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-py3.9-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Use metal host for benchmark jobs test-matrix: | { include: [ @@ -32,6 +45,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD inductor-micro-benchmark-test: name: inductor-micro-benchmark-test uses: ./.github/workflows/_linux-test.yml @@ -40,5 +54,15 @@ jobs: build-environment: linux-jammy-py3.9-gcc11 docker-image: ${{ needs.inductor-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} +======= + linux-jammy-cpu-py3_9-gcc11-inductor-micro-benchmark-test: + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + with: + build-environment: linux-jammy-py3.9-gcc11 + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 720 secrets: inherit diff --git a/.github/workflows/inductor-micro-benchmark.yml b/.github/workflows/inductor-micro-benchmark.yml index a0ae234ab5669..9d23a6d7eb128 100644 --- a/.github/workflows/inductor-micro-benchmark.yml +++ b/.github/workflows/inductor-micro-benchmark.yml @@ -13,14 +13,22 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-default-label-prefix: name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository 
== 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-nightly.yml b/.github/workflows/inductor-nightly.yml index 78602e05586b7..8ff8ffd482201 100644 --- a/.github/workflows/inductor-nightly.yml +++ b/.github/workflows/inductor-nightly.yml @@ -16,14 +16,22 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-default-label-prefix: name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -32,6 +40,7 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf +<<<<<<< HEAD nightly-dynamo-benchmarks-build: name: nightly-dynamo-benchmarks-build uses: ./.github/workflows/_linux-build.yml @@ -39,6 +48,15 @@ jobs: with: build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks +======= + linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build: + name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks + uses: ./.github/workflows/_linux-build.yml + needs: get-default-label-prefix + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | { include: [ @@ -48,6 +66,7 @@ jobs: { config: "dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" }, ]} +<<<<<<< HEAD build-additional-packages: "vision audio torchao" secrets: inherit @@ -59,5 +78,17 @@ jobs: build-environment: linux-jammy-py3.10-gcc11-build docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }} test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }} +======= + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-test: + name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 720 secrets: 
inherit diff --git a/.github/workflows/inductor-perf-compare.yml b/.github/workflows/inductor-perf-compare.yml index 628f624240127..51d22211b3124 100644 --- a/.github/workflows/inductor-perf-compare.yml +++ b/.github/workflows/inductor-perf-compare.yml @@ -10,15 +10,23 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-default-label-prefix: if: github.repository_owner == 'pytorch' name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -43,7 +51,10 @@ jobs: { config: "inductor_timm_perf_compare", shard: 2, num_shards: 2, runner: "linux.aws.a100" }, { config: "inductor_torchbench_perf_compare", shard: 1, num_shards: 1, runner: "linux.aws.a100" }, ]} +<<<<<<< HEAD build-additional-packages: "vision audio fbgemm torchao" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit test: diff --git a/.github/workflows/inductor-perf-test-nightly-aarch64.yml b/.github/workflows/inductor-perf-test-nightly-aarch64.yml index e16c8be79130d..47daf1a70ad92 100644 --- a/.github/workflows/inductor-perf-test-nightly-aarch64.yml +++ b/.github/workflows/inductor-perf-test-nightly-aarch64.yml @@ -48,14 +48,22 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -116,7 +124,10 @@ jobs: { config: "inductor_torchbench_perf_cpu_aarch64", shard: 15, num_shards: 15, runner: "linux.arm64.m7g.metal" }, ]} selected-test-configs: ${{ inputs.benchmark_configs }} +<<<<<<< HEAD build-additional-packages: "vision audio torchao" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-h100.yml 
b/.github/workflows/inductor-perf-test-nightly-h100.yml index 8209bf053a772..a9100f4862a97 100644 --- a/.github/workflows/inductor-perf-test-nightly-h100.yml +++ b/.github/workflows/inductor-perf-test-nightly-h100.yml @@ -2,7 +2,11 @@ name: inductor-perf-nightly-h100 on: schedule: +<<<<<<< HEAD - cron: 15 0 * * 1-6 +======= + - cron: 15 0,4,8,12,16,20 * * 1-6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - cron: 0 7 * * 0 # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs @@ -58,6 +62,7 @@ on: required: false type: string default: inductor_huggingface_perf_cuda_h100,inductor_timm_perf_cuda_h100,inductor_torchbench_perf_cuda_h100 +<<<<<<< HEAD pull_request: # Changing these files guarantees that this workflow needs to be run paths: @@ -71,11 +76,23 @@ concurrency: permissions: id-token: write contents: read +======= + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -84,17 +101,26 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf +<<<<<<< HEAD build: name: build +======= + # NB: Keep this in sync with trunk.yml + build: + name: cuda12.8-py3.10-gcc9-sm90 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD # Use a bigger runner here because CUDA_ARCH 9.0 is only built for H100 # or newer GPUs, so it doesn't benefit much from existing compiler cache # from trunk. 
Also use a memory-intensive runner here because memory is # usually the bottleneck runner: linux.12xlarge.memory +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks cuda-arch-list: '9.0' @@ -123,6 +149,7 @@ jobs: { config: "inductor_torchbench_perf_cuda_h100", shard: 9, num_shards: 9, runner: "linux.aws.h100" }, ]} selected-test-configs: ${{ inputs.benchmark_configs }} +<<<<<<< HEAD build-additional-packages: "vision audio fbgemm torchao" secrets: inherit @@ -131,6 +158,15 @@ jobs: uses: ./.github/workflows/_linux-test.yml needs: build if: github.event.schedule == '15 0 * * 1-6' +======= + secrets: inherit + + test-periodically: + name: cuda12.8-py3.10-gcc9-sm90 + uses: ./.github/workflows/_linux-test.yml + needs: build + if: github.event.schedule == '15 0,4,8,12,16,20 * * 1-6' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true @@ -144,7 +180,11 @@ jobs: secrets: inherit test-weekly: +<<<<<<< HEAD name: test-weekly +======= + name: cuda12.8-py3.10-gcc9-sm90 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-test.yml needs: build if: github.event.schedule == '0 7 * * 0' @@ -161,6 +201,7 @@ jobs: secrets: inherit test: +<<<<<<< HEAD name: test uses: ./.github/workflows/_linux-test.yml needs: build @@ -170,6 +211,15 @@ jobs: with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }} +======= + name: cuda12.8-py3.10-gcc9-sm90 + uses: ./.github/workflows/_linux-test.yml + needs: build + if: github.event_name == 'workflow_dispatch' + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 + dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker-image: ${{ needs.build.outputs.docker-image }} test-matrix: ${{ needs.build.outputs.test-matrix }} timeout-minutes: 720 diff --git a/.github/workflows/inductor-perf-test-nightly-macos.yml b/.github/workflows/inductor-perf-test-nightly-macos.yml index 81c1c27b76439..3482da7e80639 100644 --- a/.github/workflows/inductor-perf-test-nightly-macos.yml 
+++ b/.github/workflows/inductor-perf-test-nightly-macos.yml @@ -48,9 +48,12 @@ jobs: { config: "perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" }, { config: "perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" }, { config: "perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" }, +<<<<<<< HEAD { config: "aot_inductor_perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" }, { config: "aot_inductor_perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" }, { config: "aot_inductor_perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" }, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit @@ -63,7 +66,10 @@ jobs: # Same as the build job python-version: 3.12.7 test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }} +<<<<<<< HEAD timeout-minutes: 300 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) disable-monitor: false monitor-log-interval: 15 monitor-data-collect-interval: 4 diff --git a/.github/workflows/inductor-perf-test-nightly-rocm.yml b/.github/workflows/inductor-perf-test-nightly-rocm.yml new file mode 100644 index 0000000000000..25da0dae163d3 --- /dev/null +++ b/.github/workflows/inductor-perf-test-nightly-rocm.yml @@ -0,0 +1,123 @@ +name: inductor-perf-nightly-rocm + +on: + push: + tags: + - ciflow/inductor-perf-test-nightly-rocm/* + schedule: + - cron: 0 7 * * 0 + # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it + # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs + workflow_dispatch: + inputs: + training: + description: Run training (on by default)? + required: false + type: boolean + default: true + inference: + description: Run inference (on by default)? + required: false + type: boolean + default: true + default: + description: Run inductor_default? + required: false + type: boolean + default: false + dynamic: + description: Run inductor_dynamic_shapes? + required: false + type: boolean + default: false + cppwrapper: + description: Run inductor_cpp_wrapper? + required: false + type: boolean + default: false + cudagraphs: + description: Run inductor_cudagraphs? + required: false + type: boolean + default: true + freezing_cudagraphs: + description: Run inductor_cudagraphs with freezing for inference? + required: false + type: boolean + default: false + aotinductor: + description: Run aot_inductor for inference? + required: false + type: boolean + default: false + maxautotune: + description: Run inductor_max_autotune? 
+ required: false + type: boolean + default: false + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + default: inductor_huggingface_perf_rocm,inductor_timm_perf_rocm,inductor_torchbench_perf_rocm + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: read-all + +jobs: + get-label-type: + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + opt_out_experiments: lf + + linux-jammy-rocm-py3_10-inductor-benchmark-build: + if: github.repository_owner == 'pytorch' + name: rocm-py3_10-inductor-benchmark-build + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-rocm-py3_10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + test-matrix: | + { include: [ + { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" }, + ]} + secrets: inherit + + linux-jammy-rocm-py3_10-inductor-benchmark-test: + permissions: + id-token: write + contents: read + name: rocm-py3_10-inductor-benchmark-test + uses: ./.github/workflows/_rocm-test.yml + needs: linux-jammy-rocm-py3_10-inductor-benchmark-build + with: + build-environment: linux-jammy-rocm-py3_10 + dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true + docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.test-matrix }} + timeout-minutes: 720 + # Disable monitor in perf tests for more investigation + disable-monitor: true + monitor-log-interval: 10 + monitor-data-collect-interval: 2 + secrets: inherit diff --git 
a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml index a7110b0fd9328..ad3514c117daa 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml @@ -43,29 +43,44 @@ on: required: false type: boolean default: false +<<<<<<< HEAD freezing: description: Run freezing? required: false type: boolean default: true +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) benchmark_configs: description: The list of configs used the benchmark required: false type: string +<<<<<<< HEAD default: inductor_huggingface_perf_cpu_x86_zen,inductor_timm_perf_cpu_x86_zen,inductor_torchbench_perf_cpu_x86_zen +======= + default: inductor_huggingface_perf_zen_cpu_x86,inductor_timm_perf_zen_cpu_x86,inductor_torchbench_perf_zen_cpu_x86 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -74,12 +89,18 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf +<<<<<<< HEAD inductor-build: name: inductor-build +======= + linux-jammy-zen-cpu-py3_9-gcc11-inductor-build: + name: linux-jammy-zen-cpu-py3.9-gcc11-inductor +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | @@ -96,10 +117,29 @@ jobs: { config: "inductor_torchbench_perf_cpu_x86_zen", shard: 2, num_shards: 4, runner: "linux.24xlarge.amd" }, { config: "inductor_torchbench_perf_cpu_x86_zen", shard: 3, num_shards: 4, runner: "linux.24xlarge.amd" }, { config: "inductor_torchbench_perf_cpu_x86_zen", shard: 4, num_shards: 4, runner: "linux.24xlarge.amd" }, +======= + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + test-matrix: | + { include: [ + { config: "inductor_huggingface_perf_zen_cpu_x86", shard: 1, num_shards: 3, runner: "linux.24xlarge.amd" }, + { config: "inductor_huggingface_perf_zen_cpu_x86", shard: 2, num_shards: 3, runner: "linux.24xlarge.amd" }, + { config: "inductor_huggingface_perf_zen_cpu_x86", shard: 3, num_shards: 
3, runner: "linux.24xlarge.amd" }, + { config: "inductor_timm_perf_zen_cpu_x86", shard: 1, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_timm_perf_zen_cpu_x86", shard: 2, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_timm_perf_zen_cpu_x86", shard: 3, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_timm_perf_zen_cpu_x86", shard: 4, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_timm_perf_zen_cpu_x86", shard: 5, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 1, num_shards: 4, runner: "linux.24xlarge.amd" }, + { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 2, num_shards: 4, runner: "linux.24xlarge.amd" }, + { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 3, num_shards: 4, runner: "linux.24xlarge.amd" }, + { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 4, num_shards: 4, runner: "linux.24xlarge.amd" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} selected-test-configs: ${{ inputs.benchmark_configs }} secrets: inherit +<<<<<<< HEAD inductor-test-nightly: name: inductor-test-nightly uses: ./.github/workflows/_linux-test.yml @@ -110,6 +150,18 @@ jobs: dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true docker-image: ${{ needs.inductor-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} +======= + linux-jammy-zen-cpu-py3_9-gcc11-inductor-test-nightly: + name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build + if: github.event.schedule == '0 7 * * *' + with: + build-environment: linux-jammy-py3.9-gcc11-build + dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true + docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false @@ -117,6 +169,7 @@ jobs: monitor-data-collect-interval: 4 secrets: inherit +<<<<<<< HEAD inductor-test: name: inductor-test uses: ./.github/workflows/_linux-test.yml @@ -126,6 +179,19 @@ jobs: dashboard-tag: training-${{ inputs.training || 'false' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'true' }}-aotinductor-${{ inputs.aotinductor || 'true' }}-freezing-${{ inputs.freezing || 'true' }} docker-image: ${{ needs.inductor-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} +======= + + linux-jammy-zen-cpu-py3_9-gcc11-inductor-test: + name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build + if: github.event_name == 'workflow_dispatch' + with: + build-environment: linux-jammy-py3.9-gcc11-build + dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }} + 
docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false diff --git a/.github/workflows/inductor-perf-test-nightly-x86.yml b/.github/workflows/inductor-perf-test-nightly-x86.yml index 0533184df2e0e..5ce608d6582be 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86.yml @@ -1,9 +1,12 @@ name: inductor-perf-nightly-x86 on: +<<<<<<< HEAD pull_request: paths: - .github/workflows/inductor-perf-test-nightly-x86.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) schedule: # - cron: 0 7 * * 1-6 # - cron: 0 7 * * 0 @@ -43,11 +46,14 @@ on: required: false type: boolean default: false +<<<<<<< HEAD freezing: description: Run freezing? required: false type: boolean default: true +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) benchmark_configs: description: The list of configs used the benchmark required: false @@ -55,17 +61,28 @@ on: default: inductor_huggingface_perf_cpu_x86,inductor_timm_perf_cpu_x86,inductor_torchbench_perf_cpu_x86 concurrency: +<<<<<<< HEAD group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true permissions: id-token: write contents: read +======= + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -74,14 +91,24 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf +<<<<<<< HEAD inductor-build: name: inductor-build +======= + linux-jammy-cpu-py3_9-gcc11-inductor-build: + name: linux-jammy-cpu-py3.9-gcc11-inductor +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks +======= + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: 
ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "inductor_huggingface_perf_cpu_x86", shard: 1, num_shards: 3, runner: "linux.24xl.spr-metal" }, @@ -98,6 +125,7 @@ jobs: { config: "inductor_torchbench_perf_cpu_x86", shard: 4, num_shards: 4, runner: "linux.24xl.spr-metal" }, ]} selected-test-configs: ${{ inputs.benchmark_configs }} +<<<<<<< HEAD build-additional-packages: "vision audio torchao" secrets: inherit @@ -111,6 +139,21 @@ jobs: dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true docker-image: ${{ needs.inductor-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} +======= + secrets: inherit + + + linux-jammy-cpu-py3_9-gcc11-inductor-test-nightly: + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + if: github.event.schedule == '0 7 * * *' + with: + build-environment: linux-jammy-py3.9-gcc11-build + dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false @@ -118,6 +161,7 @@ jobs: monitor-data-collect-interval: 4 secrets: inherit +<<<<<<< HEAD inductor-test: name: inductor-test uses: ./.github/workflows/_linux-test.yml @@ -128,6 +172,19 @@ jobs: dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }} docker-image: ${{ needs.inductor-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} +======= + + linux-jammy-cpu-py3_9-gcc11-inductor-test: + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + if: github.event_name == 'workflow_dispatch' + with: + build-environment: linux-jammy-py3.9-gcc11-build + dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }} + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false diff --git a/.github/workflows/inductor-perf-test-nightly.yml b/.github/workflows/inductor-perf-test-nightly.yml index 19f72ba453414..9bc518ede20e8 100644 --- a/.github/workflows/inductor-perf-test-nightly.yml +++ b/.github/workflows/inductor-perf-test-nightly.yml @@ -63,14 +63,22 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ 
github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -79,14 +87,21 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf +<<<<<<< HEAD +======= + # NB: Keep this in sync with trunk.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build: name: cuda12.8-py3.10-gcc9-sm80 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD # Every bit to make perf run faster helps runner: linux.12xlarge.memory +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks cuda-arch-list: '8.0' @@ -113,7 +128,10 @@ jobs: { config: "cachebench", shard: 2, num_shards: 2, runner: "linux.aws.a100" }, ]} selected-test-configs: ${{ inputs.benchmark_configs }} +<<<<<<< HEAD build-additional-packages: "vision audio fbgemm torchao" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit test-nightly: diff --git a/.github/workflows/inductor-periodic.yml b/.github/workflows/inductor-periodic.yml index b08d9865d15d3..13949b3a3f651 100644 --- a/.github/workflows/inductor-periodic.yml +++ b/.github/workflows/inductor-periodic.yml @@ -15,14 +15,22 @@ concurrency: cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-default-label-prefix: name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -31,15 +39,24 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf +<<<<<<< HEAD periodic-dynamo-benchmarks-build: name: periodic-dynamo-benchmarks-build +======= + linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build: + name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +<<<<<<< HEAD cuda-arch-list: '8.0;8.6' +======= + cuda-arch-list: '8.6' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, @@ -57,6 +74,7 @@ jobs: { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, +<<<<<<< HEAD { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, @@ -133,6 +151,64 @@ jobs: inductor-smoke-build: name: inductor-smoke-build +======= + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-test: + name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build: + if: github.repository_owner == 'pytorch' + name: rocm-py3_10-periodic-dynamo-benchmarks + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-rocm-py3_10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + sync-tag: rocm-build + test-matrix: | + { include: [ + { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" }, + { config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" }, + { config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, 
runner: "linux.rocm.gpu.mi300.2" }, + { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" }, + { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + ]} + secrets: inherit + + linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-test: + permissions: + id-token: write + contents: read + name: rocm-py3_10-periodic-dynamo-benchmarks + uses: ./.github/workflows/_rocm-test.yml + needs: linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build + with: + build-environment: linux-jammy-rocm-py3_10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build: + name: cuda12.8-py3.10-gcc9-sm80 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: - get-default-label-prefix @@ -145,6 +221,7 @@ jobs: { include: [ { config: "inductor_torchbench_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.aws.a100" }, ]} +<<<<<<< HEAD build-additional-packages: "vision audio fbgemm torchao" secrets: inherit @@ -165,6 +242,27 @@ jobs: with: build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks +======= + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-test: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build: + name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks + uses: ./.github/workflows/_linux-build.yml + needs: get-default-label-prefix + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | { include: [ @@ -179,6 +277,69 @@ jobs: { config: "cpu_inductor_freezing_avx2_torchbench", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" }, { config: "cpu_inductor_freezing_avx2_timm", shard: 1, num_shards: 2, runner: "linux.10xlarge.avx2" }, { config: "cpu_inductor_freezing_avx2_timm", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" }, +<<<<<<< HEAD +======= + ]} + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-test: + name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ 
needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.test-matrix }} + secrets: inherit + + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-build.yml + needs: get-default-label-prefix + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks + cuda-arch-list: '8.6' + runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" + sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + test-matrix: | + { include: [ + { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: + name: cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-inductor-build: + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-build.yml + needs: get-default-label-prefix + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" + sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build + test-matrix: | + { include: [ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { config: "cpu_inductor_freezing_huggingface", shard: 1, num_shards: 1, runner: "linux.8xlarge.amx" }, { config: "cpu_inductor_freezing_timm", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "cpu_inductor_freezing_timm", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" }, @@ -201,6 +362,7 @@ jobs: { config: "dynamic_cpu_aot_inductor_amp_freezing_torchbench", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "dynamic_cpu_aot_inductor_amp_freezing_torchbench", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" }, ]} +<<<<<<< HEAD build-additional-packages: "vision audio torchao" secrets: inherit @@ -212,4 +374,16 @@ jobs: build-environment: linux-jammy-py3.10-gcc11-build docker-image: ${{ 
needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }} +======= + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-inductor-test: + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-rocm-mi300.yml b/.github/workflows/inductor-rocm-mi300.yml index 732ec7eb85f3e..2a13d38059c3f 100644 --- a/.github/workflows/inductor-rocm-mi300.yml +++ b/.github/workflows/inductor-rocm-mi300.yml @@ -28,7 +28,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -47,8 +51,13 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 test-matrix: | { include: [ +<<<<<<< HEAD { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, { config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, +======= + { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit diff --git a/.github/workflows/inductor-rocm.yml b/.github/workflows/inductor-rocm.yml index b1bb7972d67de..64b3e12cfca09 100644 --- a/.github/workflows/inductor-rocm.yml +++ b/.github/workflows/inductor-rocm.yml @@ -7,6 +7,10 @@ on: - release/* tags: - ciflow/inductor-rocm/* +<<<<<<< HEAD +======= + - ciflow/inductor/* +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) workflow_dispatch: concurrency: @@ -20,7 +24,11 @@ permissions: jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-unittest.yml b/.github/workflows/inductor-unittest.yml index 6ab276a57fc4d..618b97c774973 100644 --- a/.github/workflows/inductor-unittest.yml +++ b/.github/workflows/inductor-unittest.yml @@ -12,14 +12,22 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ 
github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-unittest cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -28,8 +36,13 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf +<<<<<<< HEAD inductor-build: name: inductor-build +======= + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: cuda12.8-py3.10-gcc9-sm86 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -47,6 +60,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD inductor-test: name: inductor-test uses: ./.github/workflows/_linux-test.yml @@ -59,6 +73,46 @@ jobs: inductor-halide-build: name: inductor-halide-build +======= + linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: + name: cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3_12-gcc9-inductor-build: + name: cuda12.8-py3.12-gcc9-sm86 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks + cuda-arch-list: '8.6' + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + test-matrix: | + { include: [ + { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_12-gcc9-inductor-test: + name: cuda12.8-py3.12-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_12-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cpu-py3_12-inductor-halide-build: + name: linux-jammy-cpu-py3.12-gcc11-inductor-halide +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -71,6 +125,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD 
inductor-halide-test: name: inductor-halide-test uses: ./.github/workflows/_linux-test.yml @@ -83,6 +138,20 @@ jobs: inductor-triton-cpu-build: name: inductor-triton-cpu-build +======= + linux-jammy-cpu-py3_12-inductor-halide-test: + name: linux-jammy-cpu-py3.12-gcc11-inductor-halide + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_12-inductor-halide-build + with: + build-environment: linux-jammy-py3.12-gcc11 + docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cpu-py3_12-inductor-triton-cpu-build: + name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -95,6 +164,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD inductor-triton-cpu-test: name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu uses: ./.github/workflows/_linux-test.yml @@ -112,6 +182,25 @@ jobs: with: build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks +======= + linux-jammy-cpu-py3_12-inductor-triton-cpu-test: + name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_12-inductor-triton-cpu-build + with: + build-environment: linux-jammy-py3.12-gcc11 + docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-inductor-build: + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ @@ -122,6 +211,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD inductor-cpu-test: name: inductor-cpu-test uses: ./.github/workflows/_linux-test.yml @@ -130,4 +220,39 @@ jobs: build-environment: linux-jammy-py3.10-gcc11-build docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }} +======= + linux-jammy-cpu-py3_9-gcc11-inductor-test: + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3_13-gcc9-inductor-build: + name: cuda12.8-py3.13-gcc9-sm86 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks + cuda-arch-list: '8.6' + test-matrix: | + { include: [ + { config: "inductor", shard: 1, num_shards: 2, runner: 
"${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_13-gcc9-inductor-test: + name: cuda12.8-py3.13-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_13-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index 2616141c0dc2a..80b1354cc6988 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -22,9 +22,13 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: unit-test: @@ -35,7 +39,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -44,8 +52,13 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf +<<<<<<< HEAD inductor-build: name: inductor-build +======= + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: cuda12.8-py3.10-gcc9-sm86 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -53,6 +66,10 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks cuda-arch-list: '8.6' runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD +======= + sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, @@ -61,6 +78,7 @@ jobs: { config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, { config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, ]} +<<<<<<< HEAD build-additional-packages: "vision audio fbgemm torchao" secrets: inherit @@ -82,6 +100,29 @@ jobs: build-environment: linux-jammy-py3.10-gcc11-build 
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +======= + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: + name: cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-inductor-build: + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, @@ -93,6 +134,7 @@ jobs: { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, { config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.24xl.spr-metal" }, ]} +<<<<<<< HEAD build-additional-packages: "vision audio torchao" secrets: inherit @@ -104,4 +146,16 @@ jobs: build-environment: linux-jammy-py3.10-gcc11-build docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }} +======= + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-inductor-test: + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/lint-autoformat.yml b/.github/workflows/lint-autoformat.yml index b962970dc5b78..e1eb6a7ee1a6e 100644 --- a/.github/workflows/lint-autoformat.yml +++ b/.github/workflows/lint-autoformat.yml @@ -13,7 +13,11 @@ jobs: if: ${{ github.repository_owner == 'pytorch' && contains(github.event.pull_request.labels.*.name, 'autoformat') }} steps: - name: Checkout pytorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: true fetch-depth: 0 diff --git a/.github/workflows/lint-bc.yml b/.github/workflows/lint-bc.yml index e0de9ede35084..d2cfc48fe3a37 100644 --- a/.github/workflows/lint-bc.yml +++ b/.github/workflows/lint-bc.yml @@ -20,7 +20,11 @@ jobs: runs-on: ubuntu-latest steps: - name: Run BC Lint 
Action +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/bc-lint@main +======= + uses: pytorch/test-infra/.github/actions/bc-lint@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repo: ${{ github.event.pull_request.head.repo.full_name }} base_sha: ${{ github.event.pull_request.base.sha }} diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 729b111574851..f46d53d2100a9 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -12,7 +12,10 @@ on: - landchecks/* tags: - ciflow/pull/* +<<<<<<< HEAD - ciflow/trunk/* +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) workflow_dispatch: permissions: read-all @@ -22,12 +25,17 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} +<<<<<<< HEAD get-changed-files: if: github.repository_owner == 'pytorch' name: Get changed files @@ -59,12 +67,22 @@ jobs: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter +======= + lintrunner-clang: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 + needs: get-label-type + with: + timeout: 120 + runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" + docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout # to run git rev-parse HEAD~:.ci/docker when a new image is needed fetch-depth: 0 submodules: true ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | +<<<<<<< HEAD CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}" if [ "$CHANGED_FILES" = "*" ]; then export ADDITIONAL_LINTRUNNER_ARGS="--take CLANGTIDY,CLANGFORMAT --all-files" @@ -109,12 +127,26 @@ jobs: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" docker-image: ci-image:pytorch-linux-jammy-linter +======= + export ADDITIONAL_LINTRUNNER_ARGS="--take CLANGTIDY,CLANGFORMAT --all-files" + export CLANG=1 + .github/scripts/lintrunner.sh + + lintrunner-noclang: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 + needs: get-label-type + with: + timeout: 120 + runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" + docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout # to run git rev-parse HEAD~:.ci/docker when a new image is needed fetch-depth: 0 submodules: 
true ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | +<<<<<<< HEAD CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}" echo "Running all other linters" if [ "$CHANGED_FILES" = '*' ]; then @@ -125,6 +157,13 @@ jobs: quick-checks: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main +======= + export ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT --all-files" + .github/scripts/lintrunner.sh + + quick-checks: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 @@ -164,7 +203,11 @@ jobs: if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'skip-pr-sanity-checks') steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: -1 @@ -177,7 +220,11 @@ jobs: bash .github/scripts/pr-sanity-check.sh workflow-checks: +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 @@ -188,6 +235,10 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | # Regenerate workflows +<<<<<<< HEAD +======= + export RELEASE_VERSION_TAG=2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) .github/scripts/generate_ci_workflows.py RC=0 @@ -211,7 +262,11 @@ jobs: exit $RC toc: +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 @@ -247,7 +302,11 @@ jobs: test-tools: name: Test tools if: ${{ github.repository == 'pytorch/pytorch' }} +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 @@ -267,6 +326,7 @@ jobs: runs-on: linux.24_04.4x steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main with: submodules: false @@ -275,6 +335,16 @@ jobs: uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: '3.10' +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 + with: + submodules: false + fetch-depth: 1 + - name: Setup Python 3.9 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: '3.9' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) architecture: x64 cache: pip - name: Install dependencies @@ -304,7 +374,11 @@ jobs: # [see note: pytorch repo ref] # deep clone (fetch-depth 0) required, to allow us to use git log - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: 1 @@ -324,7 +398,10 @@ jobs: check-latest: false cache: pip cache-dependency-path: | +<<<<<<< HEAD **/requirements-build.txt +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) **/requirements.txt - name: Setup Min Python version if: matrix.test_type != 'older_python_version' @@ -335,7 +412,10 @@ jobs: check-latest: false cache: pip cache-dependency-path: | +<<<<<<< HEAD **/requirements-build.txt +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) **/requirements.txt - name: Install torch if: matrix.test_type == 'with_torch' diff --git a/.github/workflows/linux-aarch64.yml b/.github/workflows/linux-aarch64.yml index 2b840a39a5c21..456f931527369 100644 --- a/.github/workflows/linux-aarch64.yml +++ b/.github/workflows/linux-aarch64.yml @@ -19,7 +19,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/llm_td_retrieval.yml b/.github/workflows/llm_td_retrieval.yml index 565a9b25df50f..512c60abcc99f 100644 --- a/.github/workflows/llm_td_retrieval.yml +++ b/.github/workflows/llm_td_retrieval.yml @@ -12,7 +12,11 @@ jobs: name: get-label-type # Don't run on forked repos if: github.repository_owner == 'pytorch' +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -116,5 +120,9 @@ jobs: AWS_REGION: "" - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() diff --git a/.github/workflows/mac-mps.yml b/.github/workflows/mac-mps.yml index c80599fe89988..87477dda1e2e3 100644 --- a/.github/workflows/mac-mps.yml +++ b/.github/workflows/mac-mps.yml @@ -28,6 +28,10 @@ jobs: # than our AWS macos-m1-14 runners test-matrix: | { include: [ +<<<<<<< HEAD +======= + { config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m1-13" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m1-14" }, { config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m2-15" }, ]} diff --git a/.github/workflows/nightly-s3-uploads.yml b/.github/workflows/nightly-s3-uploads.yml index acf3504dec9ca..d9e462b337bde 100644 --- a/.github/workflows/nightly-s3-uploads.yml +++ b/.github/workflows/nightly-s3-uploads.yml @@ -23,7 +23,11 @@ jobs: environment: upload-stats steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 696c5b68b475b..2299dc8c63342 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -19,7 +19,11 @@ concurrency: jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -42,8 +46,13 @@ jobs: needs: get-label-type with: runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11 docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 +======= + build-environment: linux-jammy-py3.9-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit docs-push: @@ -54,7 +63,11 @@ jobs: - get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11 +======= + build-environment: linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker-image: ${{ needs.docs-build.outputs.docker-image }} push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || startsWith(github.event.ref, 'refs/tags/v') }} run-doxygen: true @@ -75,24 +88,38 @@ jobs: repo-owner: pytorch branch: main pin-folder: .github/ci_commit_pins +<<<<<<< HEAD # executorch jobs are disabled since it needs some manual work for the hash update # - repo-name: executorch # repo-owner: pytorch # branch: main # pin-folder: .ci/docker/ci_commit_pins +======= + - repo-name: executorch + repo-owner: pytorch + branch: main + pin-folder: .ci/docker/ci_commit_pins +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - repo-name: triton repo-owner: triton-lang branch: main pin-folder: .ci/docker/ci_commit_pins +<<<<<<< HEAD - repo-name: vllm repo-owner: vllm-project branch: main pin-folder: .github/ci_commit_pins +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # 
Allow this to be triggered on either a schedule or on workflow_dispatch to allow for easier testing if: github.repository_owner == 'pytorch' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') steps: - name: "${{ matrix.repo-owner }}/${{ matrix.repo-name }} update-commit-hash" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/update-commit-hash@main +======= + uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repo-owner: ${{ matrix.repo-owner }} repo-name: ${{ matrix.repo-name }} diff --git a/.github/workflows/nitpicker.yml b/.github/workflows/nitpicker.yml index 40bd245ce913f..a996318343a9b 100644 --- a/.github/workflows/nitpicker.yml +++ b/.github/workflows/nitpicker.yml @@ -19,7 +19,11 @@ jobs: if: ${{ github.event.pull_request.number != 26921 && github.repository_owner == 'pytorch' }} steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - uses: ethanis/nitpicker@v1 with: nitpicks: '.github/nitpicks.yml' diff --git a/.github/workflows/operator_benchmark.yml b/.github/workflows/operator_benchmark.yml index 40fb3b8d0c85f..fc88b2ce5ea19 100644 --- a/.github/workflows/operator_benchmark.yml +++ b/.github/workflows/operator_benchmark.yml @@ -7,24 +7,34 @@ on: workflow_dispatch: inputs: test_mode: +<<<<<<< HEAD type: choice options: - 'short' - 'long' - 'all' +======= + required: false + type: string + default: 'short' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) description: tag filter for operator benchmarks, options from long, short, all schedule: # Run at 07:00 UTC every Sunday - cron: 0 7 * * 0 +<<<<<<< HEAD pull_request: paths: - benchmarks/operator_benchmark/** - .github/workflows/operator_benchmark.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read @@ -75,4 +85,43 @@ jobs: build-environment: linux-jammy-aarch64-py3.10 docker-image: ${{ needs.aarch64-opbenchmark-build.outputs.docker-image }} test-matrix: ${{ needs.aarch64-opbenchmark-build.outputs.test-matrix }} +======= +permissions: read-all + +jobs: + linux-jammy-cpu-py3_9-gcc11-opbenchmark-build: + if: github.repository_owner == 'pytorch' + name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + test-matrix: | + { include: [ + { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, + ]} + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-opbenchmark-on-demand-build: + if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }} + name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: 
linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + test-matrix: | + { include: [ + { config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, + ]} + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-opbenchmark-test: + name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-opbenchmark-build + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/periodic-rocm-mi300.yml b/.github/workflows/periodic-rocm-mi300.yml index 4d8890e69fc73..cce4798eb8ee8 100644 --- a/.github/workflows/periodic-rocm-mi300.yml +++ b/.github/workflows/periodic-rocm-mi300.yml @@ -41,7 +41,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' with: triggering_actor: ${{ github.triggering_actor }} @@ -59,9 +63,15 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 test-matrix: | { include: [ +<<<<<<< HEAD { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] }, { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] }, { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] }, +======= + { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 0c4668aa89c6b..70cd2683e3498 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -20,9 +20,13 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: llm-td: @@ -43,7 +47,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: 
pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' with: triggering_actor: ${{ github.triggering_actor }} @@ -51,6 +59,7 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-jammy-cuda12_4-py3_10-gcc11-build: name: linux-jammy-cuda12.4-py3.10-gcc11 uses: ./.github/workflows/_linux-build.yml @@ -82,6 +91,8 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-build.outputs.test-matrix }} secrets: inherit +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linux-jammy-cuda12_8-py3_10-gcc11-build: name: linux-jammy-cuda12.8-py3.10-gcc11 uses: ./.github/workflows/_linux-build.yml @@ -113,13 +124,22 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }} secrets: inherit +<<<<<<< HEAD linux-jammy-cuda12_8-py3_10-gcc9-build: name: linux-jammy-cuda12.8-py3.10-gcc9 +======= + linux-jammy-cuda12_8-py3_9-gcc9-build: + name: linux-jammy-cuda12.8-py3.9-gcc9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-cuda12.8-py3.10-gcc9 +======= + build-environment: linux-jammy-cuda12.8-py3.9-gcc9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9 cuda-arch-list: 8.6 test-matrix: | @@ -127,6 +147,7 @@ jobs: { config: "multigpu", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] }, { config: "multigpu", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] }, ]} +<<<<<<< HEAD secrets: inherit linux-jammy-cuda12_8-py3_10-gcc9-test: @@ -137,6 +158,19 @@ jobs: build-environment: linux-jammy-cuda12.8-py3.10-gcc9 docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.test-matrix }} +======= + build-with-debug: false + secrets: inherit + + linux-jammy-cuda12_8-py3_9-gcc9-test: + name: linux-jammy-cuda12.8-py3.9-gcc9 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_9-gcc9-build + with: + build-environment: linux-jammy-cuda12.8-py3.9-gcc9 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_9-gcc9-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_9-gcc9-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-jammy-cuda12_8-py3_10-gcc9-debug-build: @@ -147,6 +181,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug docker-image-name: 
ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9 +<<<<<<< HEAD cuda-arch-list: 8.9 test-matrix: | { include: [ @@ -157,6 +192,18 @@ jobs: { config: "default", shard: 5, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, { config: "default", shard: 6, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, { config: "default", shard: 7, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, +======= + build-with-debug: true + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, + { config: "default", shard: 2, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, + { config: "default", shard: 3, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, + { config: "default", shard: 4, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, + { config: "default", shard: 5, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, + { config: "default", shard: 6, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, + { config: "default", shard: 7, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit @@ -172,6 +219,7 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }} secrets: inherit +<<<<<<< HEAD linux-jammy-cuda13_0-py3_10-gcc11-build: name: linux-jammy-cuda13.0-py3.10-gcc11 uses: ./.github/workflows/_linux-build.yml @@ -204,6 +252,8 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }} secrets: inherit +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linux-jammy-rocm-py3_10-build: name: linux-jammy-rocm-py3.10 uses: ./.github/workflows/_linux-build.yml @@ -214,9 +264,15 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 test-matrix: | { include: [ +<<<<<<< HEAD { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] }, { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] }, { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] }, +======= + { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", 
owners: ["module:rocm", "oncall:distributed"] }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 7fdfab476705b..3ded657422413 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -19,9 +19,13 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: llm-td: @@ -42,21 +46,35 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} +<<<<<<< HEAD linux-jammy-py3_10-gcc11-build: name: linux-jammy-py3.10-gcc11 +======= + linux-jammy-py3_9-gcc11-build: + name: linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11 docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 +======= + build-environment: linux-jammy-py3.9-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, @@ -73,6 +91,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-gcc11-test: name: linux-jammy-py3.10-gcc11 uses: ./.github/workflows/_linux-test.yml @@ -83,11 +102,24 @@ jobs: build-environment: linux-jammy-py3.10-gcc11 docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.test-matrix }} +======= + linux-jammy-py3_9-gcc11-test: + name: linux-jammy-py3.9-gcc11 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-py3_9-gcc11-build + - target-determination + with: + build-environment: linux-jammy-py3.9-gcc11 + docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-docs: name: linux-docs uses: ./.github/workflows/_docs.yml +<<<<<<< HEAD needs: linux-jammy-py3_10-gcc11-build with: build-environment: linux-jammy-py3.10-gcc11 @@ -96,26 
+128,51 @@ jobs: linux-jammy-py3_10-gcc11-no-ops: name: linux-jammy-py3.10-gcc11-no-ops +======= + needs: linux-jammy-py3_9-gcc11-build + with: + build-environment: linux-jammy-py3.9-gcc11 + docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }} + secrets: inherit + + linux-jammy-py3_9-gcc11-no-ops: + name: linux-jammy-py3.9-gcc11-no-ops +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11-no-ops docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 +======= + build-environment: linux-jammy-py3.9-gcc11-no-ops + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, ]} secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-gcc11-pch: name: linux-jammy-py3.10-gcc11-pch +======= + linux-jammy-py3_9-gcc11-pch: + name: linux-jammy-py3.9-gcc11-pch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11-pch docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 +======= + build-environment: linux-jammy-py3.9-gcc11-pch + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, @@ -127,12 +184,16 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: +<<<<<<< HEAD runner: linux.2xlarge.memory +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.10-clang18-asan docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan test-matrix: | { include: [ +<<<<<<< HEAD { config: "default", shard: 1, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, { config: "default", shard: 2, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, { config: "default", shard: 3, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, @@ -140,10 +201,22 @@ jobs: { config: "default", shard: 5, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, { config: "default", shard: 6, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, { config: "default", shard: 7, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, +======= + { config: "default", shard: 1, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 2, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 3, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type 
}}linux.4xlarge" }, + { config: "default", shard: 4, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 5, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 6, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} sync-tag: asan-build secrets: inherit +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linux-jammy-py3_10-clang18-asan-test: name: linux-jammy-py3.10-clang18-asan uses: ./.github/workflows/_linux-test.yml @@ -157,13 +230,22 @@ jobs: sync-tag: asan-test secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-clang12-onnx-build: name: linux-jammy-py3.10-clang12-onnx +======= + linux-jammy-py3_9-clang12-onnx-build: + name: linux-jammy-py3.9-clang12-onnx +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-clang12-onnx +======= + build-environment: linux-jammy-py3.9-clang12-onnx +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-onnx test-matrix: | { include: [ @@ -172,6 +254,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-clang12-onnx-test: name: linux-jammy-py3.10-clang12-onnx uses: ./.github/workflows/_linux-test.yml @@ -186,12 +269,33 @@ jobs: linux-jammy-py3_10-clang12-build: name: linux-jammy-py3.10-clang12 +======= + linux-jammy-py3_9-clang12-onnx-test: + name: linux-jammy-py3.9-clang12-onnx + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-py3_9-clang12-onnx-build + - target-determination + with: + build-environment: linux-jammy-py3.9-clang12-onnx + docker-image: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-py3_9-clang12-build: + name: linux-jammy-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-clang12 docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12 +======= + build-environment: linux-jammy-py3.9-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, @@ -208,6 +312,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-clang12-test: name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-test.yml @@ -218,6 +323,18 @@ jobs: build-environment: linux-jammy-py3.10-clang12 docker-image: ${{ 
needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }} +======= + linux-jammy-py3_9-clang12-test: + name: linux-jammy-py3.9-clang12 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-py3_9-clang12-build + - target-determination + with: + build-environment: linux-jammy-py3.9-clang12 + docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-jammy-py3_13-clang12-build: @@ -252,22 +369,138 @@ jobs: build-environment: linux-jammy-py3.13-clang12 docker-image: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.test-matrix }} +<<<<<<< HEAD secrets: inherit linux-jammy-cuda12_8-cudnn9-py3_10-clang12-build: name: linux-jammy-cuda12.8-cudnn9-py3.10-clang12 +======= + timeout-minutes: 600 + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-build-distributed: + name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-cuda12.8-cudnn9-py3.10-clang12 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '7.5' + test-matrix: | + { include: [ + { config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, + { config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, + { config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-test-distributed: + name: linux-jammy-cuda12.8-py3.10-gcc11-test + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-build-distributed + - target-determination + with: + timeout-minutes: 360 + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-build: + name: linux-jammy-cuda12.8-py3.10-gcc11 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-cuda12.8-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, + { 
config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-test: + name: linux-jammy-cuda12.8-py3.10-gcc11 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-build + - target-determination + with: + timeout-minutes: 360 + build-environment: linux-jammy-cuda12.8-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-py3-clang12-mobile-build: + name: linux-jammy-py3-clang12-mobile-build + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-py3-clang12-mobile-build + docker-image-name: ci-image:pytorch-linux-jammy-py3-clang15-asan + build-generates-artifacts: false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 1 }, + ]} + secrets: inherit + +<<<<<<< HEAD +======= + linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build: + name: linux-jammy-cuda12.8-cudnn9-py3.9-clang12 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-cuda12.8-cudnn9-py3.9-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, ]} secrets: inherit + linux-jammy-py3_9-clang9-xla-build: + name: linux-jammy-py3_9-clang9-xla + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-py3.9-clang9-xla + docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite + test-matrix: | + { include: [ + { config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" }, + ]} + secrets: inherit + + linux-jammy-py3_9-clang9-xla-test: + name: linux-jammy-py3_9-clang9-xla + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-py3_9-clang9-xla-build + with: + build-environment: linux-jammy-py3.9-clang9-xla + docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }} + secrets: inherit + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linux-jammy-cpu-py3_10-gcc11-bazel-test: name: linux-jammy-cpu-py3.10-gcc11-bazel-test uses: ./.github/workflows/_bazel-build-test.yml @@ -283,14 +516,24 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-gcc11-mobile-lightweight-dispatch-build: name: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build +======= + linux-jammy-py3_9-gcc11-mobile-lightweight-dispatch-build: + name: 
linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 +======= + build-environment: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-generates-artifacts: false test-matrix: | { include: [ @@ -317,6 +560,65 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD +======= + linux-jammy-cuda12_8-py3_10-gcc11-sm89-build: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm89 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: 8.9 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-sm89-test: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm89 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-sm89-build + - target-determination + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-py3-clang12-executorch-build: + name: linux-jammy-py3-clang12-executorch + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-py3-clang12-executorch + docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch + test-matrix: | + { include: [ + { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + ]} + secrets: inherit + + linux-jammy-py3-clang12-executorch-test: + name: linux-jammy-py3-clang12-executorch + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-py3-clang12-executorch-build + if: false # Has been broken for a while + with: + build-environment: linux-jammy-py3-clang12-executorch + docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }} + test-matrix: ${{ 
needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }} + secrets: inherit + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: name: cuda12.8-py3.10-gcc9-sm75 uses: ./.github/workflows/_linux-build.yml @@ -342,6 +644,7 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} secrets: inherit +<<<<<<< HEAD linux-jammy-xpu-n-py3_10-build: name: linux-jammy-xpu-n-py3.10 uses: ./.github/workflows/_linux-build.yml @@ -352,6 +655,17 @@ jobs: runner_prefix: ${{ needs.get-label-type.outputs.label-type }} build-environment: linux-jammy-xpu-n-py3.10 docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 +======= + linux-jammy-xpu-2025_1-py3_9-build: + name: linux-jammy-xpu-2025.1-py3.9 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + sync-tag: linux-xpu-2025-1-build + runner_prefix: ${{ needs.get-label-type.outputs.label-type }} + build-environment: linux-jammy-xpu-2025.1-py3.9 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" }, diff --git a/.github/workflows/revert.yml b/.github/workflows/revert.yml index 226d773e48977..476691f003e69 100644 --- a/.github/workflows/revert.yml +++ b/.github/workflows/revert.yml @@ -26,7 +26,11 @@ jobs: architecture: x64 check-latest: false cache: pip +<<<<<<< HEAD - run: pip install pyyaml==6.0.2 +======= + - run: pip install pyyaml==6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup committer id run: | diff --git a/.github/workflows/rocm-mi300.yml b/.github/workflows/rocm-mi300.yml index c50111d068d24..374751730db1f 100644 --- a/.github/workflows/rocm-mi300.yml +++ b/.github/workflows/rocm-mi300.yml @@ -28,7 +28,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -36,13 +40,20 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-noble-rocm-py3_12-build: if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} name: linux-noble-rocm-py3.12-mi300 +======= + linux-jammy-rocm-py3_10-build: + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + name: linux-jammy-rocm-py3.10-mi300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-noble-rocm-py3.12-mi300 docker-image-name: 
ci-image:pytorch-linux-noble-rocm-n-py3 test-matrix: | @@ -69,4 +80,33 @@ jobs: build-environment: linux-noble-rocm-py3.12-mi300 docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }} test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }} +======= + build-environment: linux-jammy-rocm-py3.10-mi300 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + sync-tag: rocm-build + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, + ]} + secrets: inherit + + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3.10-mi300 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10-mi300 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index 50a791432dc97..706904843cd74 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -26,6 +26,7 @@ jobs: id-token: write contents: read +<<<<<<< HEAD get-label-type: name: get-label-type uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main @@ -36,13 +37,19 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linux-jammy-rocm-py3_10-build: if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} name: linux-jammy-rocm-py3.10 uses: ./.github/workflows/_linux-build.yml +<<<<<<< HEAD needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +======= + with: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-environment: linux-jammy-rocm-py3.10 docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 sync-tag: rocm-build diff --git a/.github/workflows/s390x-periodic.yml b/.github/workflows/s390x-periodic.yml index 405e3e1a581cc..2723fa23dc2ba 100644 --- a/.github/workflows/s390x-periodic.yml +++ b/.github/workflows/s390x-periodic.yml @@ -15,9 +15,13 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: llm-td: 
diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index d4992a2ddb2cf..b8c4cc8ba2dca 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -18,9 +18,13 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: llm-td: @@ -41,7 +45,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -78,14 +86,24 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.test-matrix }} secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-clang12-build: name: linux-jammy-py3.10-clang12 +======= + linux-jammy-py3_9-clang12-build: + name: linux-jammy-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-clang12 docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12 +======= + build-environment: linux-jammy-py3.9-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "slow", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, @@ -93,6 +111,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-clang12-test: name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-test.yml @@ -103,6 +122,18 @@ jobs: build-environment: linux-jammy-py3.10-clang12 docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }} +======= + linux-jammy-py3_9-clang12-test: + name: linux-jammy-py3.9-clang12 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-py3_9-clang12-build + - target-determination + with: + build-environment: linux-jammy-py3.9-clang12 + docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-jammy-rocm-py3_10-build: @@ -140,7 +171,10 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: +<<<<<<< HEAD runner: linux.2xlarge.memory +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.10-clang18-asan docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan diff --git a/.github/workflows/target-determination-indexer.yml b/.github/workflows/target-determination-indexer.yml index ec579fda8da94..216550ba0f38e 100644 --- a/.github/workflows/target-determination-indexer.yml +++ b/.github/workflows/target-determination-indexer.yml @@ -13,7 +13,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -35,7 +39,11 @@ jobs: - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 working-directory: pytorch @@ -50,13 +58,21 @@ jobs: echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG id: install-nvidia-driver +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-nvidia@main +======= + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Clone CodeLlama uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -149,7 +165,11 @@ jobs: "s3://target-determinator-assets/indexes/latest/${ZIP_NAME}" - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() concurrency: diff --git a/.github/workflows/target_determination.yml b/.github/workflows/target_determination.yml index c712b11185a76..273776839c6c1 100644 --- a/.github/workflows/target_determination.yml +++ b/.github/workflows/target_determination.yml @@ -9,7 +9,11 @@ jobs: name: get-label-type # Don't run on forked repos if: github.repository_owner == 'pytorch' +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} 
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -27,7 +31,11 @@ jobs: # checkout because when we run this action we don't *have* a local # checkout. In other cases you should prefer a local checkout. - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false diff --git a/.github/workflows/test-check-binary.yml b/.github/workflows/test-check-binary.yml index 5f0ad59d3a3bb..1291f7fa1deee 100644 --- a/.github/workflows/test-check-binary.yml +++ b/.github/workflows/test-check-binary.yml @@ -15,7 +15,11 @@ jobs: check_binary_linux_cpu: if: github.repository_owner == 'pytorch' name: Test check_binary.sh for Linux CPU +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: python:3.11 docker-build-dir: "skip-docker-build" @@ -28,9 +32,15 @@ jobs: check_binary_linux_cuda: if: github.repository_owner == 'pytorch' name: Test check_binary.sh for Linux CUDA +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g4dn.4xlarge.nvidia.gpu +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 + with: + runner: linux.4xlarge.nvidia.gpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker-image: python:3.11 docker-build-dir: "skip-docker-build" script: | diff --git a/.github/workflows/test-h100.yml b/.github/workflows/test-h100.yml index ec99f4473bb0b..43335eb2ceb7b 100644 --- a/.github/workflows/test-h100.yml +++ b/.github/workflows/test-h100.yml @@ -4,10 +4,13 @@ on: pull_request: paths: - .github/workflows/test-h100.yml +<<<<<<< HEAD - test/inductor/test_max_autotune.py - torch/_inductor/kernel/mm.py - torch/_inductor/kernel/mm_grouped.py +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) workflow_dispatch: schedule: - cron: 0 4,10,16,22 * * * # every 6 hours @@ -19,16 +22,23 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -41,7 +51,11 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< 
HEAD runner: linux.12xlarge.memory +======= + runner: "linux.12xlarge" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 cuda-arch-list: '9.0' @@ -61,6 +75,7 @@ jobs: docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.test-matrix }} secrets: inherit +<<<<<<< HEAD linux-jammy-cuda12_8-py3_10-gcc11-sm90-FA3-ABI-stable-test: name: linux-jammy-cuda12_8-py3_10-gcc11-sm90-FA3-ABI-stable-test @@ -73,3 +88,5 @@ jobs: timeout-minutes: 30 s3-bucket: gha-artifacts secrets: inherit +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/torchbench.yml b/.github/workflows/torchbench.yml index 08fcd33402625..c4694fe32d785 100644 --- a/.github/workflows/torchbench.yml +++ b/.github/workflows/torchbench.yml @@ -10,15 +10,22 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-default-label-prefix: if: github.repository_owner == 'pytorch' name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/trunk-tagging.yml b/.github/workflows/trunk-tagging.yml index d96f2de8366aa..cc024b89cbdd9 100644 --- a/.github/workflows/trunk-tagging.yml +++ b/.github/workflows/trunk-tagging.yml @@ -58,10 +58,15 @@ jobs: else COMMIT_SHA="${{ github.sha }}" fi +<<<<<<< HEAD { echo "sha=${COMMIT_SHA}" echo "tag_name=trunk/${COMMIT_SHA}" } >> "${GITHUB_OUTPUT}" +======= + echo "sha=${COMMIT_SHA}" >> "${GITHUB_OUTPUT}" + echo "tag_name=trunk/${COMMIT_SHA}" >> "${GITHUB_OUTPUT}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Validate commit SHA run: | @@ -89,7 +94,11 @@ jobs: echo "✅ Commit ${COMMIT_SHA} is valid (automatic push trigger)" fi +<<<<<<< HEAD - name: Create and push tag(s) with retry +======= + - name: Create and push tag with retry +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id: check_tag env: TAG_NAME: ${{ steps.commit.outputs.tag_name }} @@ -114,6 +123,7 @@ jobs: return 1 } +<<<<<<< HEAD # Counters for summary reporting created_count=0 skipped_count=0 @@ -131,6 +141,16 @@ jobs: fi } trap finish EXIT +======= + # Exit early if tag already exists + if check_tag_exists; then + echo "✅ Tag already exists - no action needed" + echo "exists=true" >> "${GITHUB_OUTPUT}" + exit 0 
+ fi + + echo "Tag ${TAG_NAME} does not exist, proceeding with creation" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Retry configuration MAX_RETRIES=5 @@ -205,6 +225,7 @@ jobs: } } +<<<<<<< HEAD # New behavior for push events: enumerate commits in the push and tag each one. # For workflow_dispatch, retain existing single-SHA behavior. @@ -278,11 +299,21 @@ jobs: failed_count=1 exit 1 fi +======= + # Execute with retry + if retry_with_backoff "tag_with_retry" "Creating tag ${TAG_NAME} for commit ${COMMIT_SHA}"; then + echo "exists=false" >> "${GITHUB_OUTPUT}" + exit 0 + else + echo "Tag creation failed after all retry attempts" + exit 1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi - name: Tag creation summary if: always() run: | +<<<<<<< HEAD if [ "${{ github.event_name }}" = "push" ]; then echo "Trigger: push on main" echo "Created: ${{ steps.check_tag.outputs.created_count }}" @@ -312,4 +343,21 @@ jobs: if [ -n "${{ github.event.inputs.commit_sha }}" ]; then echo " Manual commit: ${{ github.event.inputs.commit_sha }}" fi +======= + if [ "${{ steps.check_tag.outputs.exists }}" = "true" ]; then + echo "✅ Tag ${{ steps.commit.outputs.tag_name }} already existed - no action needed" + elif [ "${{ job.status }}" = "success" ]; then + echo "✅ Successfully created tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}" + else + echo "❌ Failed to create tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}" + fi + + echo "" + echo "Tag details:" + echo " Name: ${{ steps.commit.outputs.tag_name }}" + echo " Commit: ${{ steps.commit.outputs.sha }}" + echo " Trigger: ${{ github.event_name }}" + if [ -n "${{ github.event.inputs.commit_sha }}" ]; then + echo " Manual commit: ${{ github.event.inputs.commit_sha }}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 48d1c4490d726..8d3b9d1fbe956 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -16,9 +16,13 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: llm-td: @@ -39,7 +43,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -56,13 +64,18 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 build-generates-artifacts: false runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runner: 
"linux.c7i.4xlarge" +======= + runner: "linux.4xlarge" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, ]} secrets: inherit +<<<<<<< HEAD linux-jammy-cuda12_8-py3_10-gcc11-build: name: linux-jammy-cuda12.8-py3.10-gcc11 uses: ./.github/workflows/_linux-build.yml @@ -100,6 +113,8 @@ jobs: secrets: inherit +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated linux-jammy-cuda12_8-py3_10-gcc11-no-ops-build: name: linux-jammy-cuda12.8-py3.10-gcc11-no-ops @@ -131,6 +146,10 @@ jobs: { config: "default", shard: 1, num_shards: 3, runner: "macos-m1-stable" }, { config: "default", shard: 2, num_shards: 3, runner: "macos-m1-stable" }, { config: "default", shard: 3, num_shards: 3, runner: "macos-m1-stable" }, +<<<<<<< HEAD +======= + { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-13" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" }, { config: "mps", shard: 1, num_shards: 1, runner: "macos-m2-15" }, ]} @@ -160,10 +179,16 @@ jobs: runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" test-matrix: | { include: [ +<<<<<<< HEAD { config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, { config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, { config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, { config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, +======= + { config: "default", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, + { config: "default", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, + { config: "default", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit @@ -180,6 +205,7 @@ jobs: disable-monitor: false secrets: inherit +<<<<<<< HEAD win-vs2022-cuda12_8-py3-build: name: win-vs2022-cuda12.8-py3 uses: ./.github/workflows/_win-build.yml @@ -187,6 +213,15 @@ jobs: with: build-environment: win-vs2022-cuda12.8-py3 cuda-version: "12.8" +======= + win-vs2022-cuda12_6-py3-build: + name: win-vs2022-cuda12.6-py3 + uses: ./.github/workflows/_win-build.yml + needs: get-label-type + with: + build-environment: win-vs2022-cuda12.6-py3 + cuda-version: "12.6" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" secrets: inherit @@ -202,8 +237,14 @@ jobs: sync-tag: rocm-build test-matrix: | { include: [ +<<<<<<< HEAD { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, { config: "default", 
shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, +======= + { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2" }, + { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.4" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit @@ -221,6 +262,7 @@ jobs: build-environment: linux-jammy-rocm-py3.10 docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} +<<<<<<< HEAD tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor" secrets: inherit @@ -230,10 +272,23 @@ jobs: needs: get-label-type with: build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm80 +======= + tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl" + secrets: inherit + + # NB: Keep this in sync with inductor-perf-test-nightly.yml + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks cuda-arch-list: '8.0' secrets: inherit +<<<<<<< HEAD # Test cross-compiled models with Windows libs extracted from wheel cross-compile-linux-test: name: cross-compile-linux-test @@ -251,14 +306,21 @@ jobs: ]} secrets: inherit +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) verify-cachebench-cpu-build: name: verify-cachebench-cpu-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11 docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks +======= + build-environment: linux-jammy-py3.9-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, @@ -272,6 +334,7 @@ jobs: - verify-cachebench-cpu-build - target-determination with: +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11 docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }} @@ -311,3 +374,9 @@ jobs: build-environment: linux-jammy-py3.10-gcc11-full-debug-build-only docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 secrets: inherit +======= + build-environment: linux-jammy-py3.9-gcc11 + docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }} + secrets: inherit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for 
mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml index 5c456c607c887..e18659f799f1f 100644 --- a/.github/workflows/trymerge.yml +++ b/.github/workflows/trymerge.yml @@ -28,7 +28,11 @@ jobs: check-latest: false cache: pip architecture: x64 +<<<<<<< HEAD - run: pip install pyyaml==6.0.2 +======= + - run: pip install pyyaml==6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup committer id run: | @@ -59,6 +63,7 @@ jobs: # on the PR appear in chronological order (timing issues can shuffle them around) sleep 60 fi +<<<<<<< HEAD # Require a comment id for merge operations if [ -z "${COMMENT_ID}" ]; then @@ -72,6 +77,24 @@ jobs: python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}" else python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}" +======= + if [ -n "${FORCE}" ]; then + if [ -n "${COMMENT_ID}" ]; then + python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}" + else + python3 .github/scripts/trymerge.py --force "${PR_NUM}" + fi + elif [ -n "${IGNORE_CURRENT}" ]; then + if [ -n "${COMMENT_ID}" ]; then + python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}" + else + python3 .github/scripts/trymerge.py --ignore-current "${PR_NUM}" + fi + elif [ -n "${COMMENT_ID}" ]; then + python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}" + else + python3 .github/scripts/trymerge.py "${PR_NUM}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi - name: Comment on Canceled if: ${{ cancelled() && steps.checkout.outcome == 'success' }} diff --git a/.github/workflows/tryrebase.yml b/.github/workflows/tryrebase.yml index 1a8e00e4390be..43275303c3acc 100644 --- a/.github/workflows/tryrebase.yml +++ b/.github/workflows/tryrebase.yml @@ -25,7 +25,11 @@ jobs: architecture: x64 check-latest: false cache: pip +<<<<<<< HEAD - run: pip install pyyaml==6.0.2 +======= + - run: pip install pyyaml==6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup committer id run: | diff --git a/.github/workflows/unstable.yml b/.github/workflows/unstable.yml index b5955127d9fb3..038ac58bd4efb 100644 --- a/.github/workflows/unstable.yml +++ b/.github/workflows/unstable.yml @@ -12,9 +12,13 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: # There must be at least one job here to satisfy GitHub action workflow syntax @@ -46,7 +50,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && 
github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index b3fc9efdf667f..62eadf26e2fd7 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -7,7 +7,11 @@ on: concurrency: group: ${{ github.workflow }} +<<<<<<< HEAD cancel-in-progress: true +======= + cancel-in-progress: false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: do_update_viablestrict: @@ -18,12 +22,20 @@ jobs: environment: ${{ (github.event_name == 'schedule') && 'mergebot' || '' }} steps: - name: Update viable/strict +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/update-viablestrict@main +======= + uses: pytorch/test-infra/.github/actions/update-viablestrict@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id: update_viablestrict with: repository: pytorch/pytorch stable-branch: viable/strict +<<<<<<< HEAD requires: '[\"pull\", \"trunk\", \"lint\", \"linux-aarch64\"]' +======= + requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\"]' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }} clickhouse-url: ${{ secrets.CLICKHOUSE_URL }} clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }} @@ -48,7 +60,10 @@ jobs: echo "{\"sha\": \"${LATEST_SHA}\", \"repository\":\"pytorch/pytorch\", \"timestamp\": ${TIME}}" > "/tmp/${LATEST_SHA}.json" pip install awscli==1.29.40 aws s3 cp "/tmp/${LATEST_SHA}.json" "s3://ossci-raw-job-status/stable_pushes/pytorch/pytorch/${LATEST_SHA}.json" +<<<<<<< HEAD # Push new viable/strict tag cd pytorch/pytorch git push origin "${LATEST_SHA}:refs/tags/viable/strict/${TIME}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi diff --git a/.github/workflows/update_pytorch_labels.yml b/.github/workflows/update_pytorch_labels.yml index a1b8c38141ae8..8a3eccc523a22 100644 --- a/.github/workflows/update_pytorch_labels.yml +++ b/.github/workflows/update_pytorch_labels.yml @@ -17,7 +17,11 @@ jobs: contents: read steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/upload-test-stats-while-running.yml b/.github/workflows/upload-test-stats-while-running.yml index 9aecaad0e068f..b0757b8622850 100644 --- a/.github/workflows/upload-test-stats-while-running.yml +++ b/.github/workflows/upload-test-stats-while-running.yml @@ -16,7 +16,11 @@ jobs: runs-on: linux.2xlarge steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml 
index f77b6081b776a..c5d6a4fef1158 100644 --- a/.github/workflows/upload-test-stats.yml +++ b/.github/workflows/upload-test-stats.yml @@ -14,7 +14,10 @@ on: - inductor-periodic - rocm - rocm-mi300 +<<<<<<< HEAD - rocm-mi355 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - inductor-micro-benchmark - inductor-micro-benchmark-x86 - inductor-cu124 @@ -58,7 +61,11 @@ jobs: run: echo "${TRIGGERING_WORKFLOW}" - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Configure aws credentials uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 diff --git a/.github/workflows/upload-torch-dynamo-perf-stats.yml b/.github/workflows/upload-torch-dynamo-perf-stats.yml index 07471619437a2..f8a9fb57ec0e8 100644 --- a/.github/workflows/upload-torch-dynamo-perf-stats.yml +++ b/.github/workflows/upload-torch-dynamo-perf-stats.yml @@ -32,7 +32,11 @@ jobs: name: Upload dynamo performance stats for ${{ github.event.workflow_run.id }}, attempt ${{ github.event.workflow_run.run_attempt }} steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: 1 diff --git a/.github/workflows/upload_test_stats_intermediate.yml b/.github/workflows/upload_test_stats_intermediate.yml index 5702562006055..159d1e0873eba 100644 --- a/.github/workflows/upload_test_stats_intermediate.yml +++ b/.github/workflows/upload_test_stats_intermediate.yml @@ -17,7 +17,11 @@ jobs: environment: upload-stats steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml index b95dadd5f2b1c..b820ca44ef3be 100644 --- a/.github/workflows/weekly.yml +++ b/.github/workflows/weekly.yml @@ -22,7 +22,11 @@ jobs: fetch-depth: 0 - name: update-xla-commit-hash continue-on-error: true +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/update-commit-hash@main +======= + uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repo-name: xla branch: master diff --git a/.github/workflows/xpu.yml b/.github/workflows/xpu.yml index 36f603f70fde7..79d4b2d3fe3db 100644 --- a/.github/workflows/xpu.yml +++ b/.github/workflows/xpu.yml @@ -5,10 +5,13 @@ on: tags: - ciflow/xpu/* workflow_dispatch: +<<<<<<< HEAD schedule: # Run 3 times on weekdays and less frequently on weekends. 
- cron: 45 0,8,16 * * 1-5 - cron: 45 4 * * 0,6 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} @@ -19,13 +22,18 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-jammy-xpu-n-1-py3_10-build: name: linux-jammy-xpu-n-1-py3.10 uses: ./.github/workflows/_linux-build.yml @@ -36,6 +44,18 @@ jobs: build-environment: linux-jammy-xpu-n-1-py3.10 docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3 runner: linux.c7i.12xlarge +======= + linux-jammy-xpu-2025_0-py3_9-build: + name: linux-jammy-xpu-2025.0-py3.9 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + sync-tag: linux-xpu-2025-0-build + runner_prefix: ${{ needs.get-label-type.outputs.label-type }} + build-environment: linux-jammy-xpu-2025.0-py3.9 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.0-py3 + runner: linux.12xlarge +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" }, @@ -47,6 +67,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-jammy-xpu-n-py3_10-build: name: linux-jammy-xpu-n-py3.10 uses: ./.github/workflows/_linux-build.yml @@ -78,10 +99,38 @@ jobs: name: linux-jammy-xpu-n-py3.10 uses: ./.github/workflows/_xpu-test.yml needs: linux-jammy-xpu-n-py3_10-build +======= + linux-jammy-xpu-2025_1-py3_9-build: + name: linux-jammy-xpu-2025.1-py3.9 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + sync-tag: linux-xpu-2025-1-build + runner_prefix: ${{ needs.get-label-type.outputs.label-type }} + build-environment: linux-jammy-xpu-2025.1-py3.9 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3 + runner: linux.12xlarge + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.idc.xpu" }, + ]} + secrets: inherit + + linux-jammy-xpu-2025_1-py3_9-test: + name: linux-jammy-xpu-2025.1-py3.9 + uses: ./.github/workflows/_xpu-test.yml + needs: linux-jammy-xpu-2025_1-py3_9-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) permissions: id-token: write contents: read with: +<<<<<<< HEAD build-environment: 
linux-jammy-xpu-n-py3.10 docker-image: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.test-matrix }} @@ -93,11 +142,37 @@ jobs: uses: ./.github/workflows/_win-build.yml with: build-environment: win-vs2022-xpu-n-1-py3 +======= + build-environment: linux-jammy-xpu-2025.1-py3.9 + docker-image: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.test-matrix }} + secrets: inherit + + windows-xpu-2025_0-build: + if: github.repository_owner == 'pytorch' + name: win-vs2022-xpu-2025_0-py3 + uses: ./.github/workflows/_win-build.yml + with: + build-environment: win-vs2022-xpu-py3 + cuda-version: cpu + use-xpu: true + xpu-version: '2025.0' + vc-year: '2022' + secrets: inherit + + windows-xpu-2025_1-build: + if: github.repository_owner == 'pytorch' + name: win-vs2022-xpu-2025_1-py3 + uses: ./.github/workflows/_win-build.yml + with: + build-environment: win-vs2022-xpu-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-version: cpu use-xpu: true xpu-version: '2025.1' vc-year: '2022' secrets: inherit +<<<<<<< HEAD windows-xpu-n-build: if: github.repository_owner == 'pytorch' @@ -110,3 +185,5 @@ jobs: xpu-version: '2025.2' vc-year: '2022' secrets: inherit +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.gitignore b/.gitignore index 447ef777e9291..7fd4227d952a1 100644 --- a/.gitignore +++ b/.gitignore @@ -32,7 +32,10 @@ coverage.xml aten/build/ aten/src/ATen/Config.h aten/src/ATen/cuda/CUDAConfig.h +<<<<<<< HEAD aten/src/ATen/hip/HIPConfig.h +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) benchmarks/.data caffe2/cpp_test/ dist/ @@ -82,13 +85,20 @@ torch/return_types.pyi torch/nn/functional.pyi torch/utils/data/datapipes/datapipe.pyi torch/csrc/autograd/generated/* +<<<<<<< HEAD torch/csrc/functionalization/generated/* +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torch/csrc/lazy/generated/*.[!m]* torch_compile_debug/ # Listed manually because some files in this directory are not generated torch/testing/_internal/generated/annotated_fn_args.py torch/testing/_internal/data/*.pt +<<<<<<< HEAD torch/headeronly/version.h +======= +torch/csrc/api/include/torch/version.h +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torch/csrc/cudnn/cuDNN.cpp torch/csrc/generated torch/csrc/generic/TensorMethods.cpp @@ -148,9 +158,12 @@ merge_record.json torchgen/packaged/* !torchgen/packaged/README.md +<<<<<<< HEAD # This file is injected by ROCm build scripts to bootstrap in torch/__init__.py. 
torch/_rocm_init.py +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # IPython notebook checkpoints .ipynb_checkpoints @@ -260,9 +273,12 @@ gen .pytest_cache aten/build/* +<<<<<<< HEAD # Linker scripts for prioritized text optimization cmake/linker_script.ld +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Bram plsdontbreak @@ -374,7 +390,10 @@ third_party/ruy/ third_party/glog/ # Virtualenv +<<<<<<< HEAD .venv/ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) venv/ # Log files @@ -394,6 +413,9 @@ android/pytorch_android_torchvision/.cxx # Claude Code local configuration CLAUDE.local.md +<<<<<<< HEAD /test_*.py /debug_*.py CLAUDE_CONTEXT/ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.gitmodules b/.gitmodules index ba1bca8c7e6c6..3582acaa0af54 100644 --- a/.gitmodules +++ b/.gitmodules @@ -87,7 +87,11 @@ url = https://github.com/NVIDIA/cudnn-frontend.git [submodule "third_party/kineto"] path = third_party/kineto +<<<<<<< HEAD url = https://github.com/pytorch/kineto.git +======= + url = https://github.com/pytorch/kineto +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [submodule "third_party/pocketfft"] path = third_party/pocketfft url = https://github.com/mreineck/pocketfft @@ -129,6 +133,9 @@ [submodule "third_party/flash-attention"] path = third_party/flash-attention url = https://github.com/Dao-AILab/flash-attention.git +<<<<<<< HEAD [submodule "third_party/aiter"] path = third_party/aiter url = https://github.com/ROCm/aiter.git +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.lintrunner.toml b/.lintrunner.toml index 26ade791a1bde..0a48295057c14 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -13,12 +13,19 @@ exclude_patterns = [ '**/fb/**', 'functorch/docs/**', 'functorch/examples/**', +<<<<<<< HEAD 'functorch/docs/source/tutorials/**', +======= + 'functorch/notebooks/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'torch/_inductor/fx_passes/serialized_patterns/**', 'torch/_inductor/autoheuristic/artifacts/**', 'scripts/**', 'test/generated_type_hints_smoketest.py', +<<<<<<< HEAD 'test/test_torchfuzz_repros.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # CPython tests 'test/dynamo/cpython/**', # Tests from the NumPy test suite @@ -28,7 +35,10 @@ exclude_patterns = [ 'torch/lib/**', 'venv/**', '**/*.pyi', +<<<<<<< HEAD "tools/experimental/torchfuzz/**", +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'tools/test/test_selective_build.py', ] command = [ @@ -41,6 +51,7 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', +<<<<<<< HEAD 'flake8==7.3.0', 'flake8-bugbear==24.12.12', 'flake8-comprehensions==3.16.0', @@ -52,6 +63,19 @@ init_command = [ 'pycodestyle==2.14.0', 'pyflakes==3.4.0', 'torchfix==0.4.0 ; python_version >= "3.10" and 
python_version < "3.13"', +======= + 'flake8==6.1.0', + 'flake8-bugbear==23.3.23', + 'flake8-comprehensions==3.15.0', + 'flake8-executable==2.1.3', + 'flake8-logging-format==0.9.0', + 'flake8-pyi==23.3.1', + 'flake8-simplify==0.19.3', + 'mccabe==0.7.0', + 'pycodestyle==2.11.1', + 'pyflakes==3.1.0', + 'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] @@ -124,8 +148,11 @@ is_formatter = true [[linter]] code = 'MYPY' include_patterns = [ +<<<<<<< HEAD 'setup.py', 'functorch/dim/**/*.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'torch/**/*.py', 'torch/**/*.pyi', 'caffe2/**/*.py', @@ -135,7 +162,11 @@ include_patterns = [ 'test/test_complex.py', 'test/test_datapipe.py', 'test/test_futures.py', +<<<<<<< HEAD 'test/test_numpy_interop.py', +======= + # 'test/test_numpy_interop.py', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'test/test_torch.py', 'test/test_type_hints.py', 'test/test_type_info.py', @@ -155,22 +186,37 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', +<<<<<<< HEAD 'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"', +======= + 'numpy==1.26.4 ; python_version >= "3.9" and python_version <= "3.11"', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'numpy==2.1.0 ; python_version >= "3.12"', 'expecttest==0.3.0', 'mypy==1.16.0', 'sympy==1.13.3', 'types-requests==2.27.25', +<<<<<<< HEAD 'types-pyyaml==6.0.2', +======= + 'types-pyyaml==6.0.1', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'types-tabulate==0.8.8', 'types-protobuf==5.29.1.20250403', 'types-setuptools==79.0.0.20250422', 'types-jinja2==2.11.9', 'types-colorama==0.4.6', +<<<<<<< HEAD 'filelock==3.18.0', 'junitparser==2.1.1', 'rich==14.1.0', 'pyyaml==6.0.2', +======= + 'filelock==3.13.1', + 'junitparser==2.1.1', + 'rich==10.9.0', + 'pyyaml==6.0.1', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'optree==0.13.0', 'dataclasses-json==0.6.7', 'pandas==2.2.3', @@ -198,7 +244,10 @@ exclude_patterns = [ 'tools/test/gen_operators_yaml_test.py', 'tools/test/gen_oplist_test.py', 'tools/test/test_selective_build.py', +<<<<<<< HEAD 'tools/experimental/torchfuzz/**', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] command = [ 'python3', @@ -209,6 +258,7 @@ command = [ '@{{PATHSFILE}}' ] +<<<<<<< HEAD [[linter]] code = 'PYREFLY' @@ -249,6 +299,8 @@ init_command = [ 'types-python-dateutil==2.9.0.20251008' ] +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [[linter]] code = 'CLANGTIDY' include_patterns = [ @@ -275,8 +327,12 @@ include_patterns = [ 'c10/**/*.cpp', 'c10/**/*.h', 'torch/*.h', +<<<<<<< HEAD 'torch/_inductor/codegen/aoti_runtime/*.h', 'torch/_inductor/codegen/aoti_runtime/*.cpp', +======= + 'torch/_inductor/codegen/aoti_runtime/interface.cpp', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes 
with float/bfloat16/half (#2791)) 'torch/csrc/*.h', 'torch/csrc/*.cpp', 'torch/csrc/**/*.h', @@ -544,7 +600,11 @@ include_patterns = [ '**/*.h', ] exclude_patterns = [ +<<<<<<< HEAD 'torch/headeronly/macros/Macros.h', +======= + 'c10/macros/Macros.h', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] command = [ 'python3', @@ -567,7 +627,11 @@ include_patterns = [ '**/*.h', ] exclude_patterns = [ +<<<<<<< HEAD 'torch/headeronly/macros/Macros.h', +======= + 'c10/macros/Macros.h', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] command = [ 'python3', @@ -627,7 +691,11 @@ exclude_patterns = [ command = [ 'python3', 'tools/linter/adapters/grep_linter.py', +<<<<<<< HEAD '--pattern=#include >>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) '--allowlist-pattern=#include ', '--linter-name=PYBIND11_INCLUDE', '--match-first-only', @@ -833,7 +901,12 @@ exclude_patterns = [ command = [ 'python3', 'tools/linter/adapters/grep_linter.py', +<<<<<<< HEAD '--pattern=(cudaSetDevice|cudaGetDevice)\\(', +======= + '--pattern=cudaSetDevice(', + '--pattern=cudaGetDevice(', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) '--linter-name=RAWCUDADEVICE', '--error-name=raw CUDA API usage', """--error-description=\ @@ -1007,6 +1080,10 @@ exclude_patterns = [ 'test/jit/**', # should be run through test/test_jit.py 'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py 'test/fx/**', # should be run through test/test_fx.py +<<<<<<< HEAD +======= + 'test/bottleneck_test/**', # excluded by test/run_test.py +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'test/package/**', # excluded by test/run_test.py 'test/distributed/argparse_util_test.py', 'test/distributed/bin/test_script.py', @@ -1137,8 +1214,16 @@ command = [ [[linter]] code = 'WORKFLOWSYNC' include_patterns = [ +<<<<<<< HEAD '.github/workflows/*.yml', '.github/workflows/*.yaml', +======= + '.github/workflows/pull.yml', + '.github/workflows/trunk.yml', + '.github/workflows/periodic.yml', + '.github/workflows/mac-mps.yml', + '.github/workflows/slow.yml', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] command = [ 'python3', @@ -1150,7 +1235,11 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', +<<<<<<< HEAD 'pyyaml==6.0.2', +======= + 'PyYAML==6.0.1', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] [[linter]] @@ -1172,7 +1261,11 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', +<<<<<<< HEAD 'pyyaml==6.0.2', +======= + 'PyYAML==6.0.1', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] [[linter]] @@ -1197,6 +1290,7 @@ exclude_patterns = [ 'torch/_vendor/**', 'torch/_inductor/fx_passes/serialized_patterns/**', 'torch/_inductor/autoheuristic/artifacts/**', +<<<<<<< HEAD 'torch/utils/model_dump/preact.mjs', # These files are all grandfathered in, feel free to remove from this list # as necessary @@ -1204,6 +1298,31 @@ exclude_patterns = [ 
'aten/src/ATen/native/[a-pA-P]*/**', 'aten/src/ATen/[a-mA-M]*/**', 'test/**', +======= + # These files are all grandfathered in, feel free to remove from this list + # as necessary + # NOTE: remove the patterns in the order they are listed + 'aten/**', + 'aten/src/ATen/native/**', + 'aten/src/ATen/native/q*/**', + 'aten/src/ATen/native/[a-pA-P]*/**', + 'aten/src/ATen/[a-mA-M]*/**', + 'test/**', + 'test/test_*', + 'test/[a-hA-h]*/**', + 'test/inductor/**', + 'test/dynamo/**', + 'test/distributed/**', + 'torch/**', + 'torch/_*/**', + 'torch/ao/**', + 'torch/fx/**', + 'torch/distributed/tensor/**', + 'torch/[j-o]*/**', + 'torch/utils/**', + 'torch/csrc/jit/**', + 'torch/csrc/jit/[a-o]*/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] init_command = [ 'python3', @@ -1298,7 +1417,10 @@ exclude_patterns = [ 'test/test_masked.py', 'test/test_maskedtensor.py', 'test/test_matmul_cuda.py', +<<<<<<< HEAD 'test/test_scaled_matmul_cuda.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'test/test_meta.py', 'test/test_metal.py', 'test/test_mkl_verbose.py', @@ -1450,6 +1572,11 @@ exclude_patterns = [ 'torch/utils/benchmark/utils/timer.py', 'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py', 'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py', +<<<<<<< HEAD +======= + 'torch/utils/bottleneck/__init__.py', + 'torch/utils/bottleneck/__main__.py', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'torch/utils/bundled_inputs.py', 'torch/utils/checkpoint.py', 'torch/utils/collect_env.py', @@ -1490,13 +1617,22 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', +<<<<<<< HEAD 'usort==1.0.8.post1', 'isort==6.0.1', 'ruff==0.13.1', # sync with RUFF +======= + '--no-black-binary', + 'black==23.12.1', + 'usort==1.0.8.post1', + 'isort==6.0.1', + 'ruff==0.11.13', # sync with RUFF +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] is_formatter = true [[linter]] +<<<<<<< HEAD code = 'PYPROJECT' command = [ 'python3', @@ -1541,6 +1677,8 @@ init_command = [ ] [[linter]] +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) code = 'COPYRIGHT' include_patterns = ['**'] exclude_patterns = [ @@ -1606,10 +1744,17 @@ include_patterns = [ exclude_patterns = [ 'caffe2/**', 'functorch/docs/**', +<<<<<<< HEAD 'torch/_inductor/fx_passes/serialized_patterns/**', 'torch/_inductor/autoheuristic/artifacts/**', 'test/dynamo/cpython/**', 'test/test_torchfuzz_repros.py', +======= + 'functorch/notebooks/**', + 'torch/_inductor/fx_passes/serialized_patterns/**', + 'torch/_inductor/autoheuristic/artifacts/**', + 'test/dynamo/cpython/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'scripts/**', 'third_party/**', 'fb/**', @@ -1627,7 +1772,11 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', +<<<<<<< HEAD 'ruff==0.13.1', # sync with PYFMT +======= + 'ruff==0.11.13', # sync with PYFMT +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] is_formatter = true @@ -1636,10 +1785,14 @@ 
is_formatter = true # the same line, merge conflicts should not arise in git or hg [[linter]] code = 'MERGE_CONFLICTLESS_CSV' +<<<<<<< HEAD include_patterns = [ 'benchmarks/dynamo/ci_expected_accuracy/*.csv', 'benchmarks/dynamo/pr_time_benchmarks/expected_results.csv', ] +======= +include_patterns = ['benchmarks/dynamo/ci_expected_accuracy/*.csv'] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) command = [ 'python3', 'tools/linter/adapters/no_merge_conflict_csv_linter.py', @@ -1830,6 +1983,7 @@ include_patterns = [ 'torch/header_only_apis.txt', ] is_formatter = false +<<<<<<< HEAD [[linter]] @@ -1839,3 +1993,5 @@ command = [ "python3", "tools/linter/adapters/gb_registry_linter.py", ] +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/AGENTS.md b/AGENTS.md index 3d5436a02a85d..dd27ff6213af6 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,4 +1,5 @@ - This is the only AGENTS.md, there are no recursive AGENTS.md +<<<<<<< HEAD - When you are working on a bug, first create a standalone file that reproduces the bug and verify it fails in the expected way. Use this to test if your changes work. Once the change is passing, find an appropriate @@ -15,3 +16,5 @@ - git reset --hard $(cat /tmp/orig_work.txt) # NB: reset to the LOCAL branch, do NOT fetch - git stash pop - Resolve conflicts if necessary +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/BUILD.bazel b/BUILD.bazel index 4737a2a0c486c..2ac9a0823ad87 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -13,9 +13,12 @@ load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libt load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources") load("//:tools/bazel.bzl", "rules") +<<<<<<< HEAD # Export files for use by torch/headeronly (where version.h generation now lives) exports_files(["version.txt"]) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) define_targets(rules = rules) COMMON_COPTS = [ @@ -94,8 +97,11 @@ generated_cpu_cpp = [ "aten/src/ATen/NativeMetaFunctions.h", "aten/src/ATen/RegistrationDeclarations.h", "aten/src/ATen/VmapGeneratedPlumbing.h", +<<<<<<< HEAD "aten/src/ATen/ViewMetaClasses.h", "aten/src/ATen/ViewMetaClasses.cpp", +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "aten/src/ATen/core/aten_interned_strings.h", "aten/src/ATen/core/enum_tag.h", "aten/src/ATen/core/TensorBody.h", @@ -284,7 +290,10 @@ header_template_rule( "@AT_BLAS_F2C@": "0", "@AT_BLAS_USE_CBLAS_DOT@": "1", "@AT_KLEIDIAI_ENABLED@": "0", +<<<<<<< HEAD "@AT_USE_EIGEN_SPARSE@": "0", +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }, ) @@ -685,7 +694,10 @@ cc_library( [ "torch/*.h", "torch/csrc/**/*.h", +<<<<<<< HEAD "torch/nativert/**/*.h", +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "torch/csrc/distributed/c10d/**/*.hpp", "torch/lib/libshm/*.h", ], @@ -693,9 +705,13 @@ cc_library( "torch/csrc/*/generated/*.h", 
"torch/csrc/jit/serialization/mobile_bytecode_generated.h", ] + torch_cuda_headers, +<<<<<<< HEAD ) + GENERATED_AUTOGRAD_CPP + [ "//torch/headeronly:version_h", ], +======= + ) + GENERATED_AUTOGRAD_CPP + [":version_h"], +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) includes = [ "third_party/kineto/libkineto/include", "torch/csrc", @@ -754,7 +770,10 @@ cc_library( "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu", "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu", "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp", +<<<<<<< HEAD "torch/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp", +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu", ], )) + torch_sources, @@ -840,6 +859,39 @@ pybind_extension( ], ) +<<<<<<< HEAD +======= +cc_library( + name = "functorch", + hdrs = glob([ + "functorch/csrc/dim/*.h", + ]), + srcs = glob([ + "functorch/csrc/dim/*.cpp", + ]), + deps = [ + ":aten_nvrtc", + ":torch_python", + "@pybind11", + ], +) + +pybind_extension( + name = "functorch/_C", + copts=[ + "-DTORCH_EXTENSION_NAME=_C" + ], + srcs = [ + "functorch/csrc/init_dim_only.cpp", + ], + deps = [ + ":functorch", + ":torch_python", + ":aten_nvrtc", + ], +) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cc_binary( name = "torch/bin/torch_shm_manager", srcs = [ @@ -880,6 +932,10 @@ py_library( ], data = [ ":torch/_C.so", +<<<<<<< HEAD +======= + ":functorch/_C.so", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ":torch/bin/torch_shm_manager", ], ) @@ -1082,7 +1138,10 @@ test_suite( "aten/src/ATen/templates/LazyNonNativeIr.h", "aten/src/ATen/templates/RegisterDispatchKey.cpp", "aten/src/ATen/templates/RegisterDispatchDefinitions.ini", +<<<<<<< HEAD "aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp", +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "aten/src/ATen/native/native_functions.yaml", "aten/src/ATen/native/tags.yaml", "aten/src/ATen/native/ts_native_functions.yaml", diff --git a/CMakeLists.txt b/CMakeLists.txt index f081d8166d7f7..7f23de84e9d36 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,8 @@ cmake_minimum_required(VERSION 3.27 FATAL_ERROR) +<<<<<<< HEAD +======= +# cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this # sometimes makes XCode C compiler gets detected as "Clang", even when the C++ @@ -25,7 +29,11 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR}) endif() # ---[ Project and semantic versioning. 
+<<<<<<< HEAD project(Torch CXX C HIP) +======= +project(Torch CXX C) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") set(LINUX TRUE) @@ -56,11 +64,19 @@ set(CMAKE_C_STANDARD # ---[ Utils include(cmake/public/utils.cmake) +<<<<<<< HEAD # --- [ Check that minimal gcc version is 9.2+ if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.2) message( FATAL_ERROR "GCC-9.2 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}" +======= +# --- [ Check that minimal gcc version is 9.3+ +if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.3) + message( + FATAL_ERROR + "GCC-9.3 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) endif() @@ -232,16 +248,23 @@ cmake_dependent_option(INSTALL_TEST "Install test binaries if BUILD_TEST is on" option(USE_CPP_CODE_COVERAGE "Compile C/C++ with code coverage flags" OFF) option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON) option(USE_ASAN "Use Address+Undefined Sanitizers" OFF) +<<<<<<< HEAD option(USE_LSAN "Use Leak Sanitizer" OFF) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) option(USE_TSAN "Use Thread Sanitizer" OFF) option(USE_CUDA "Use CUDA" ON) option(USE_XPU "Use XPU" ON) cmake_dependent_option( BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) +<<<<<<< HEAD cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX OR WIN32" OFF) cmake_dependent_option(USE_ROCM_CK_GEMM "Use ROCm Composable Kernel for GEMMs" ON "USE_ROCM;NOT WIN32" OFF) option(USE_ROCM_CK_SDPA "Use ROCm Composable Kernel for SDPA" OFF) +======= +cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF @@ -253,6 +276,10 @@ cmake_dependent_option(USE_CUFILE "Use cuFile" ON "USE_CUDA AND NOT WIN32" OFF) option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) option(USE_KINETO "Use Kineto profiling library" ON) option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) +<<<<<<< HEAD +======= +option(USE_FAKELOWP "Use FakeLowp operators" OFF) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) option(USE_GFLAGS "Use GFLAGS" OFF) option(USE_GLOG "Use GLOG" OFF) option(USE_LITE_PROTO "Use lite protobuf instead of full." 
OFF) @@ -261,6 +288,7 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF) option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF) option(USE_NATIVE_ARCH "Use -march=native" OFF) cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF) +<<<<<<< HEAD option(USE_DISTRIBUTED "Use distributed" ON) cmake_dependent_option(USE_NCCL "Use NCCL" ON "USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) @@ -273,6 +301,16 @@ cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL" OFF) cmake_dependent_option(USE_NVSHMEM "Use NVSHMEM" ON "USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) +======= +cmake_dependent_option(USE_NCCL "Use NCCL" ON + "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) +cmake_dependent_option(USE_XCCL "Use XCCL" ON + "USE_XPU;UNIX;NOT APPLE" OFF) +cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF) +cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF) +cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL" + OFF) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) option(USE_NNAPI "Use NNAPI" OFF) option(USE_NNPACK "Use NNPACK" ON) cmake_dependent_option(USE_NUMA "Use NUMA. Only available on Linux." ON "LINUX" @@ -289,7 +327,10 @@ option(USE_PRECOMPILED_HEADERS "Use pre-compiled headers to accelerate build." option(USE_PROF "Use profiling" OFF) option(USE_PYTORCH_QNNPACK "Use ATen/QNNPACK (quantized 8-bit operators)" ON) option(USE_SNPE "Use Qualcomm's SNPE library" OFF) +<<<<<<< HEAD option(USE_EIGEN_SPARSE "Use Eigen Sparse Matrices" OFF) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) option(USE_SYSTEM_EIGEN_INSTALL "Use system Eigen instead of the one under third_party" OFF) cmake_dependent_option( @@ -326,6 +367,10 @@ set(MKLDNN_ENABLE_CONCURRENT_EXEC ${USE_MKLDNN}) cmake_dependent_option(USE_MKLDNN_CBLAS "Use CBLAS in MKLDNN" OFF "USE_MKLDNN" OFF) option(USE_STATIC_MKL "Prefer to link with MKL statically (Unix only)" OFF) +<<<<<<< HEAD +======= +option(USE_DISTRIBUTED "Use distributed" ON) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cmake_dependent_option( USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." ON "USE_DISTRIBUTED" OFF) @@ -378,6 +423,7 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler" OFF "USE_CUDA" OFF) cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON "CPU_AARCH64" OFF) +<<<<<<< HEAD # prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le. set(USE_PRIORITIZED_TEXT_DEFAULT OFF) if(LINUX AND CPU_AARCH64) @@ -391,6 +437,14 @@ option(USE_MIMALLOC "Use mimalloc" OFF) # on Windows and AArch64. option(USE_MIMALLOC_ON_MKL "Use mimalloc on MKL" OFF) if(WIN32 OR (CPU_AARCH64 AND NOT APPLE)) +======= + +option(USE_MIMALLOC "Use mimalloc" OFF) +# Enable third party mimalloc library to improve memory allocation performance +# on Windows. 
+option(USE_MIMALLOC_ON_MKL "Use mimalloc on MKL" OFF) +if(WIN32) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set(USE_MIMALLOC ON) # Not enable USE_MIMALLOC_ON_MKL due to it caused issue: @@ -442,7 +496,11 @@ if(WIN32) message( WARNING "Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. " +<<<<<<< HEAD "Please run command 'conda install -c conda-forge libuv=1.51' to install libuv." +======= + "Please run command 'conda install -c conda-forge libuv=1.39' to install libuv." +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) else() set(ENV{libuv_ROOT} ${libuv_tmp_LIBRARY}/../../) @@ -574,7 +632,11 @@ if(MSVC) set(CMAKE_NINJA_CMCLDEPS_RC OFF) if(MSVC_Z7_OVERRIDE) # CMake set debug flags to use /Z7 +<<<<<<< HEAD set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$:Embedded>") +======= + set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT Embedded) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif() foreach( flag_var @@ -663,11 +725,14 @@ endif(MSVC) string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all") +<<<<<<< HEAD # Set linker max-page-size to 64KiB on AArch64 Linux if(LINUX AND CPU_AARCH64) add_link_options_if_supported("-z,max-page-size=0x10000") endif() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not # applicable to mobile are disabled by this variable. Setting # `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it @@ -849,11 +914,18 @@ include(ExternalProject) # ---[ Dependencies ---[ FBGEMM doesn't work on x86 32bit and # CMAKE_SYSTEM_PROCESSOR thinks its 64bit +<<<<<<< HEAD if(USE_FBGEMM AND NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") message(WARNING "x64 operating system is required for FBGEMM. " "Not compiling with FBGEMM. 
" "Turn this warning off by USE_FBGEMM=OFF.") +======= +if(USE_FBGEMM + AND((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND CMAKE_SIZEOF_VOID_P EQUAL + 4) + OR CMAKE_SYSTEM_PROCESSOR STREQUAL "x86")) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set(USE_FBGEMM OFF) endif() @@ -888,6 +960,7 @@ cmake_dependent_option( "(USE_CUDA AND NOT MSVC) OR USE_ROCM" OFF) +<<<<<<< HEAD IF(USE_ROCM AND "gfx942" IN_LIST PYTORCH_ROCM_ARCH) message(WARNING "Setting USE_FBGEMM_GENAI for gfx942 to ON by default, doing ROCM build") @@ -912,6 +985,8 @@ cmake_dependent_option( if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32) endif() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem # Eff Attention won't cmake_dependent_option( @@ -945,10 +1020,13 @@ if(USE_FBGEMM) string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM") endif() +<<<<<<< HEAD if(USE_FBGEMM_GENAI) string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM_GENAI") endif() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if(USE_PYTORCH_QNNPACK) string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_QNNPACK") endif() @@ -1225,7 +1303,11 @@ if(APPLE) string( APPEND CMAKE_SHARED_LINKER_FLAGS +<<<<<<< HEAD " -weak_framework Foundation -weak_framework MetalPerformanceShaders -weak_framework MetalPerformanceShadersGraph -weak_framework Metal -weak_framework IOKit" +======= + " -weak_framework Foundation -weak_framework MetalPerformanceShaders -weak_framework MetalPerformanceShadersGraph -weak_framework Metal" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) # To suppress MPSGraph availability warnings append_cxx_flag_if_supported("-Wno-unguarded-availability-new" @@ -1234,6 +1316,13 @@ if(APPLE) append_cxx_flag_if_supported("-Wno-missing-braces" CMAKE_CXX_FLAGS) endif() +<<<<<<< HEAD +======= +if(USE_XPU) + string(APPEND CMAKE_CXX_FLAGS " -DUSE_XPU") +endif() + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if(EMSCRIPTEN) string( APPEND @@ -1285,7 +1374,10 @@ if(USE_MIMALLOC AND USE_MIMALLOC_ON_MKL) endif() # ---[ Main build +<<<<<<< HEAD add_subdirectory(torch/headeronly) # headeronly headers +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) add_subdirectory(c10) add_subdirectory(caffe2) @@ -1395,6 +1487,13 @@ endif() include(cmake/Summary.cmake) caffe2_print_configuration_summary() +<<<<<<< HEAD +======= +if(BUILD_FUNCTORCH) + add_subdirectory(functorch) +endif() + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Parse custom debug info if(DEFINED USE_CUSTOM_DEBINFO) string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}") @@ -1433,6 +1532,7 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA) install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas" DESTINATION "${CMAKE_INSTALL_BINDIR}") endif() +<<<<<<< HEAD if(USE_PRIORITIZED_TEXT_FOR_LD) add_compile_options( @@ -1487,3 +1587,5 @@ else() ]]) endif() endif() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/CODEOWNERS b/CODEOWNERS index cc249dc4f43a2..ca5526e3eb5e6 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -14,6 +14,10 @@ /torch/csrc/autograd/ @albanD @soulitzer /torch/autograd/ @albanD @soulitzer /tools/autograd/ @albanD @soulitzer +<<<<<<< HEAD +======= +/torch/header_only_apis.txt @janeyx99 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /torch/nn/ @albanD @jbschlosser @mikaylagawarecki /torch/optim/ @albanD @janeyx99 /test/test_public_bindings.py @albanD @@ -50,12 +54,21 @@ nn/qat/ @jerryzh168 /torch/csrc/distributed/c10d/Ops.* @kwen2501 # ONNX Export +<<<<<<< HEAD /torch/_dynamo/backends/onnxrt.py @titaiwangms @xadupre @justinchuby /torch/csrc/jit/passes/onnx.h @titaiwangms @xadupre /torch/csrc/jit/passes/onnx.cpp @titaiwangms @xadupre /torch/csrc/jit/passes/onnx/ @titaiwangms @xadupre /torch/onnx/ @titaiwangms @xadupre @justinchuby /test/onnx/ @titaiwangms @xadupre @justinchuby +======= +/torch/_dynamo/backends/onnxrt.py @wschin +/torch/csrc/jit/passes/onnx.h @titaiwangms @shubhambhokare1 +/torch/csrc/jit/passes/onnx.cpp @titaiwangms @shubhambhokare1 +/torch/csrc/jit/passes/onnx/ @titaiwangms @shubhambhokare1 +/torch/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin +/test/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # CI /.ci @pytorch/pytorch-dev-infra @@ -135,7 +148,11 @@ torch/profiler/ @sraikund16 test/functorch/test_aotdispatch.py @ezyang @Chillee # Dataloader +<<<<<<< HEAD torch/utils/data/ @divyanshk @ramanishsingh @scotts +======= +torch/utils/data/ @divyanshk @ramanishsingh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # hipify torch/utils/hipify/ @jeffdaily @jithunnair-amd @@ -164,7 +181,10 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd # torch.export /torch/export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi /torch/_export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi +<<<<<<< HEAD /torch/_export/serde/schema.py @SherlockNoMad @zhxchen17 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Dynamic Shapes /torch/fx/experimental/symbolic_shapes.py @bobrenjc93 @laithsakka @@ -181,6 +201,7 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd /torch/csrc/jit/python/init.cpp @mikaylagawarecki # CUDA and CUDA math libraries +<<<<<<< HEAD aten/src/ATen/cuda/ @eqy @syed-ahmed @Aidyn-A aten/src/ATen/cudnn/ @eqy @syed-ahmed @Aidyn-A aten/src/ATen/native/cuda/ @eqy @syed-ahmed @Aidyn-A @@ -190,12 +211,24 @@ torch/cuda/ @eqy @syed-ahmed @Aidyn-A torch/csrc/cuda/ @eqy @syed-ahmed @Aidyn-A torch/backends/cuda/ @eqy @syed-ahmed @Aidyn-A torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A +======= +aten/src/ATen/cuda/ @eqy @syed-ahmed +aten/src/ATen/cudnn/ @eqy @syed-ahmed +aten/src/ATen/native/cuda/ @eqy @syed-ahmed +aten/src/ATen/native/cudnn/ @eqy @syed-ahmed +c10/cuda @eqy @syed-ahmed +torch/cuda/ @eqy @syed-ahmed +torch/csrc/cuda/ @eqy @syed-ahmed +torch/backends/cuda/ @eqy @syed-ahmed +torch/backends/cudnn/ @eqy @syed-ahmed +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # PyTree utilities /torch/utils/_pytree.py @XuehaiPan 
/torch/utils/_cxx_pytree.py @XuehaiPan /torch/utils/pytree/ @XuehaiPan /torch/_dynamo/polyfills/pytree.py @XuehaiPan +<<<<<<< HEAD # Relating to libtorch ABI /torch/csrc/stable/ @janeyx99 @mikaylagawarecki @@ -215,3 +248,5 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A /aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58 /aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58 /test/test_scaled_matmul_cuda.py @drisspg @slayton58 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4c46077f9db71..8c35edba26b6a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -81,13 +81,18 @@ git remote add upstream git@github.com:pytorch/pytorch.git make setup-env # Or run `make setup-env-cuda` for pre-built CUDA binaries # Or run `make setup-env-rocm` for pre-built ROCm binaries +<<<<<<< HEAD source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows +======= +source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` ### Tips and Debugging * If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below. +<<<<<<< HEAD * When installing with `python -m pip install -e . -v --no-build-isolation` (in contrast to `python -m pip install . -v --no-build-isolation`) Python runtime will use the current local source-tree when importing `torch` package. (This is done by creating [`.egg-link`](https://wiki.python.org/moin/PythonPackagingTerminology#egg-link) file in `site-packages` folder) This way you do not need to repeatedly install after modifying Python files (`.py`). @@ -101,6 +106,22 @@ source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows ``` Afterwards rebuilding a library (for example to rebuild `libtorch_cpu.so` issue `ninja torch_cpu` from `build` folder), would be sufficient to make change visible in `torch` package. +======= +* When installing with `python setup.py develop` (in contrast to `python setup.py install`) Python runtime will use + the current local source-tree when importing `torch` package. (This is done by creating [`.egg-link`](https://wiki.python.org/moin/PythonPackagingTerminology#egg-link) file in `site-packages` folder) + This way you do not need to repeatedly install after modifying Python files (`.py`). + However, you would need to reinstall if you modify Python interface (`.pyi`, `.pyi.in`) or + non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...). + + + One way to avoid running `python setup.py develop` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac, + is to create a symbolic link from `build` folder to `torch/lib`, for example, by issuing following: + ```bash + pushd torch/lib; sh -c "ln -sf ../../build/lib/libtorch_cpu.* ."; popd + ``` + Afterwards rebuilding a library (for example to rebuild `libtorch_cpu.so` issue `ninja torch_cpu` from `build` folder), + would be sufficient to make change visible in `torch` package. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) To reinstall, first uninstall all existing PyTorch installs. You may need to run `pip @@ -114,9 +135,15 @@ source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows pip uninstall torch ``` +<<<<<<< HEAD Next run `python setup.py clean`. 
After that, you can install in editable mode again. * If you run into errors when running `python -m pip install -e . -v --no-build-isolation`, here are some debugging steps: +======= + Next run `python setup.py clean`. After that, you can install in `develop` mode again. + +* If you run into errors when running `python setup.py develop`, here are some debugging steps: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 1. Run `printf '#include \nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure your CMake works and can compile this simple Hello World program without errors. 2. Nuke your `build` directory. The `setup.py` script compiles binaries into the `build` folder and caches many @@ -129,6 +156,7 @@ source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows git clean -xdf python setup.py clean git submodule update --init --recursive +<<<<<<< HEAD python -m pip install --group dev python -m pip install --no-build-isolation -v -e . ``` @@ -143,6 +171,15 @@ source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows python -m pip install --no-build-isolation -v -e . ``` +======= + python setup.py develop + ``` + 4. The main step within `python setup.py develop` is running `make` from the `build` directory. If you want to + experiment with some environment variables, you can pass them into the command: + ```bash + ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* python setup.py develop + ``` +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * If you run into issue running `git submodule update --init --recursive`. Please try the following: - If you encounter an error such as @@ -182,26 +219,39 @@ You can use this script to check out a new nightly branch with the following: ```bash ./tools/nightly.py checkout -b my-nightly-branch +<<<<<<< HEAD source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows +======= +source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` To install the nightly binaries built with CUDA, you can pass in the flag `--cuda`: ```bash ./tools/nightly.py checkout -b my-nightly-branch --cuda +<<<<<<< HEAD source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows +======= +source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` To install the nightly binaries built with ROCm, you can pass in the flag `--rocm`: ```bash ./tools/nightly.py checkout -b my-nightly-branch --rocm +<<<<<<< HEAD source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows +======= +source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` You can also use this tool to pull the nightly commits into the current branch: ```bash +<<<<<<< HEAD ./tools/nightly.py pull source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows ``` @@ -212,6 +262,10 @@ pass in the `--python` argument: ```bash ./tools/nightly.py --python /path/to/python3.12 source venv/bin/activate # or `. 
.\venv\Scripts\activate` on Windows +======= +./tools/nightly.py pull -p my-env +source my-env/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` Pulling will recreate a fresh virtual environment and reinstall the development @@ -267,7 +321,10 @@ dependencies as well as the nightly binaries into the repo directory. support for PyTorch. * [tools](tools) - Code generation scripts for the PyTorch library. See [README](tools/README.md) of this directory for more details. +<<<<<<< HEAD * [torchgen](torchgen) - contains the logic and tooling for generating PyTorch's low-level C++ and Python bindings from operator definitions, typically specified in native_functions.yaml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * [test](test) - Python unit tests for PyTorch Python frontend. * [test_torch.py](test/test_torch.py) - Basic tests for PyTorch functionality. @@ -303,7 +360,11 @@ The following packages should be installed with `pip`: - `pytest` - recommended to run tests more selectively Running ``` +<<<<<<< HEAD pip install --group dev +======= +pip install -r requirements.txt +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` will install these dependencies for you. @@ -654,9 +715,15 @@ can be selected interactively with your mouse to zoom in on a particular part of the program execution timeline. The `--native` command-line option tells `py-spy` to record stack frame entries for PyTorch C++ code. To get line numbers for C++ code it may be necessary to compile PyTorch in debug mode by prepending +<<<<<<< HEAD your `python -m pip install -e . -v --no-build-isolation` call to compile PyTorch with `DEBUG=1`. Depending on your operating system it may also be necessary to run `py-spy` with root privileges. +======= +your `setup.py develop` call to compile PyTorch with `DEBUG=1`. Depending on +your operating system it may also be necessary to run `py-spy` with root +privileges. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) `py-spy` can also work in an `htop`-like "live profiling" mode and can be tweaked to adjust the stack sampling rate, see the `py-spy` readme for more @@ -664,10 +731,17 @@ details. ## Managing multiple build trees +<<<<<<< HEAD One downside to using `python -m pip install -e . -v --no-build-isolation` is that your development version of PyTorch will be installed globally on your account (e.g., if you run `import torch` anywhere else, the development version will be used). +======= +One downside to using `python setup.py develop` is that your development +version of PyTorch will be installed globally on your account (e.g., if +you run `import torch` anywhere else, the development version will be +used). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) If you want to manage multiple builds of PyTorch, you can make use of [venv environments](https://docs.python.org/3/library/venv.html) to maintain @@ -678,7 +752,11 @@ specific build of PyTorch. 
To set one up: python -m venv pytorch-myfeature source pytorch-myfeature/bin/activate # or `& .\pytorch-myfeature\Scripts\Activate.ps1` on Windows # if you run python now, torch will NOT be installed +<<<<<<< HEAD python -m pip install --no-build-isolation -v -e . +======= +python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` ## C++ development tips @@ -716,9 +794,13 @@ variables `DEBUG`, `USE_DISTRIBUTED`, `USE_MKLDNN`, `USE_CUDA`, `USE_FLASH_ATTEN For example: ```bash +<<<<<<< HEAD DEBUG=1 USE_DISTRIBUTED=0 USE_MKLDNN=0 USE_CUDA=0 BUILD_TEST=0 \ USE_FBGEMM=0 USE_NNPACK=0 USE_QNNPACK=0 USE_XNNPACK=0 \ python -m pip install --no-build-isolation -v -e . +======= +DEBUG=1 USE_DISTRIBUTED=0 USE_MKLDNN=0 USE_CUDA=0 BUILD_TEST=0 USE_FBGEMM=0 USE_NNPACK=0 USE_QNNPACK=0 USE_XNNPACK=0 python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` For subsequent builds (i.e., when `build/CMakeCache.txt` exists), the build @@ -728,7 +810,11 @@ options. ### Code completion and IDE support +<<<<<<< HEAD When using `python -m pip install -e . -v --no-build-isolation`, PyTorch will generate +======= +When using `python setup.py develop`, PyTorch will generate +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a `compile_commands.json` file that can be used by many editors to provide command completion and error highlighting for PyTorch's C++ code. You need to `pip install ninja` to generate accurate @@ -789,7 +875,11 @@ If not, you can define these variables on the command line before invoking `setu export CMAKE_C_COMPILER_LAUNCHER=ccache export CMAKE_CXX_COMPILER_LAUNCHER=ccache export CMAKE_CUDA_COMPILER_LAUNCHER=ccache +<<<<<<< HEAD python -m pip install --no-build-isolation -v -e . +======= +python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` #### Use a faster linker @@ -802,7 +892,11 @@ If you are editing a single file and rebuilding in a tight loop, the time spent Starting with CMake 3.29, you can specify the linker type using the [`CMAKE_LINKER_TYPE`](https://cmake.org/cmake/help/latest/variable/CMAKE_LINKER_TYPE.html) variable. For example, with `mold` installed: ```sh +<<<<<<< HEAD CMAKE_LINKER_TYPE=MOLD python -m pip install --no-build-isolation -v -e . +======= +CMAKE_LINKER_TYPE=MOLD python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` #### Use pre-compiled headers @@ -814,7 +908,11 @@ setting `USE_PRECOMPILED_HEADERS=1` either on first setup, or in the `CMakeCache.txt` file. ```sh +<<<<<<< HEAD USE_PRECOMPILED_HEADERS=1 python -m pip install --no-build-isolation -v -e . +======= +USE_PRECOMPILED_HEADERS=1 python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` This adds a build step where the compiler takes `` and essentially @@ -837,7 +935,11 @@ A compiler-wrapper to fix this is provided in `tools/nvcc_fix_deps.py`. You can this as a compiler launcher, similar to `ccache` ```bash export CMAKE_CUDA_COMPILER_LAUNCHER="python;`pwd`/tools/nvcc_fix_deps.py;ccache" +<<<<<<< HEAD python -m pip install --no-build-isolation -v -e . 
+======= +python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` ### Rebuild few files with debug information @@ -1188,7 +1290,11 @@ build_with_asan() CFLAGS="-fsanitize=address -fno-sanitize-recover=all -shared-libasan -pthread" \ CXX_FLAGS="-pthread" \ USE_CUDA=0 USE_OPENMP=0 USE_DISTRIBUTED=0 DEBUG=1 \ +<<<<<<< HEAD python -m pip install --no-build-isolation -v -e . +======= + python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } run_with_asan() diff --git a/Dockerfile b/Dockerfile index 331cf00593cb2..dceb1b1bb9663 100644 --- a/Dockerfile +++ b/Dockerfile @@ -33,7 +33,11 @@ RUN case ${TARGETPLATFORM} in \ *) MINICONDA_ARCH=x86_64 ;; \ esac && \ curl -fsSL -v -o ~/miniconda.sh -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-${MINICONDA_ARCH}.sh" +<<<<<<< HEAD COPY requirements.txt requirements-build.txt . +======= +COPY requirements.txt . +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Manually invoke bash on miniconda script per https://github.com/conda/conda/issues/10431 RUN chmod +x ~/miniconda.sh && \ bash ~/miniconda.sh -b -p /opt/conda && \ @@ -47,6 +51,7 @@ WORKDIR /opt/pytorch COPY . . RUN git submodule update --init --recursive +<<<<<<< HEAD FROM conda as conda-installs ARG PYTHON_VERSION=3.11 ARG CUDA_PATH=cu121 @@ -54,6 +59,28 @@ ARG INSTALL_CHANNEL=whl/nightly # Automatically set by buildx # pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574 RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda=25.7.0 +======= +FROM conda as build +ARG CMAKE_VARS +WORKDIR /opt/pytorch +COPY --from=conda /opt/conda /opt/conda +COPY --from=submodule-update /opt/pytorch /opt/pytorch +RUN make triton +RUN --mount=type=cache,target=/opt/ccache \ + export eval ${CMAKE_VARS} && \ + TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ + CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \ + python setup.py install + +FROM conda as conda-installs +ARG PYTHON_VERSION=3.11 +ARG CUDA_PATH=cu121 +ARG CUDA_CHANNEL=nvidia +ARG INSTALL_CHANNEL=whl/nightly +# Automatically set by buildx +RUN /opt/conda/bin/conda update -y -n base -c defaults conda +RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARG TARGETPLATFORM @@ -96,5 +123,9 @@ WORKDIR /workspace FROM official as dev # Should override the already installed version from the official-image stage +<<<<<<< HEAD COPY --from=conda /opt/conda /opt/conda COPY --from=submodule-update /opt/pytorch /opt/pytorch +======= +COPY --from=build /opt/conda /opt/conda +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/MANIFEST.in b/MANIFEST.in index bb8e488283a96..ddac0b58efb0d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ +<<<<<<< HEAD # Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html # Include individual top-level files @@ -97,3 +98,36 @@ include .gitmodules # concern here. 
# [2] https://packaging.python.org/en/latest/specifications/source-distribution-format/#source-distribution-archive-features +======= +include MANIFEST.in +include CMakeLists.txt +include CITATION.cff +include LICENSE +include NOTICE +include .gitmodules +include build_variables.bzl +include mypy.ini +include requirements.txt +include ufunc_defs.bzl +include version.txt +recursive-include android *.* +recursive-include aten *.* +recursive-include binaries *.* +recursive-include c10 *.* +recursive-include caffe2 *.* +recursive-include cmake *.* +recursive-include torch *.* +recursive-include tools *.* +recursive-include test *.* +recursive-include docs *.* +recursive-include ios *.* +recursive-include third_party * +recursive-include test *.* +recursive-include benchmarks *.* +recursive-include scripts *.* +recursive-include mypy_plugins *.* +recursive-include modules *.* +recursive-include functorch *.* +prune */__pycache__ +global-exclude *.o *.so *.dylib *.a .git *.pyc *.swp +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/Makefile b/Makefile index 3db2b7aa44e76..3dc907d125f8f 100644 --- a/Makefile +++ b/Makefile @@ -57,8 +57,12 @@ setup-env-cuda: setup-env-rocm: $(MAKE) setup-env PYTHON="$(PYTHON)" NIGHTLY_TOOL_OPTS="$(NIGHTLY_TOOL_OPTS) --rocm" +<<<<<<< HEAD .PHONY: setup-lint setup-lint .lintbin/.lintrunner.sha256: requirements.txt pyproject.toml .lintrunner.toml +======= +.lintbin/.lintrunner.sha256: requirements.txt pyproject.toml .lintrunner.toml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) @echo "Setting up lintrunner..." $(PIP) install lintrunner lintrunner init @@ -66,6 +70,12 @@ setup-lint .lintbin/.lintrunner.sha256: requirements.txt pyproject.toml .lintrun @mkdir -p .lintbin @sha256sum requirements.txt pyproject.toml .lintrunner.toml > .lintbin/.lintrunner.sha256 +<<<<<<< HEAD +======= +.PHONY: setup-lint +setup-lint: .lintbin/.lintrunner.sha256 + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) .PHONY: lazy-setup-lint lazy-setup-lint: .lintbin/.lintrunner.sha256 @if [ ! -x "$(shell command -v lintrunner)" ]; then \ diff --git a/README.md b/README.md index a0c9b54c95a8b..abbaa4e36e1b3 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,11 @@ Elaborating Further: If you use NumPy, then you have used Tensors (a.k.a. ndarray). +<<<<<<< HEAD ![Tensor illustration](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/tensor_illustration.png) +======= +![Tensor illustration](./docs/source/_static/img/tensor_illustration.png) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) PyTorch provides Tensors that can live either on the CPU or the GPU and accelerates the computation by a huge amount. 
@@ -161,7 +165,11 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv) #### Prerequisites If you are installing from source, you will need: +<<<<<<< HEAD - Python 3.10 or later +======= +- Python 3.9 or later +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux) - Visual Studio or Visual Studio Build Tool (Windows only) @@ -200,7 +208,11 @@ If you want to compile with CUDA support, [select a supported version of CUDA fr - [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v8.5 or above - [Compiler](https://gist.github.com/ax3l/9489132) compatible with CUDA +<<<<<<< HEAD Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver, and NVIDIA hardware. +======= +Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) If you want to disable CUDA support, export the environment variable `USE_CUDA=0`. Other potentially useful environment variables may be found in `setup.py`. If @@ -228,7 +240,10 @@ If you want to disable Intel GPU support, export the environment variable `USE_X Other potentially useful environment variables may be found in `setup.py`. #### Get the PyTorch Source +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ```bash git clone https://github.com/pytorch/pytorch cd pytorch @@ -242,8 +257,14 @@ git submodule update --init --recursive **Common** ```bash +<<<<<<< HEAD # Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section above pip install --group dev +======= +conda install cmake ninja +# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section below +pip install -r requirements.txt +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` **On Linux** @@ -275,6 +296,7 @@ conda install pkg-config libuv pip install mkl-static mkl-include # Add these packages if torch.distributed is needed. # Distributed package support on Windows is a prototype feature and is subject to changes. +<<<<<<< HEAD conda install -c conda-forge libuv=1.51 ``` @@ -284,22 +306,41 @@ conda install -c conda-forge libuv=1.51 If you're compiling for AMD ROCm then first run this command: +======= +conda install -c conda-forge libuv=1.39 +``` + +#### Install PyTorch +**On Linux** + +If you're compiling for AMD ROCm then first run this command: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ```bash # Only run this if you're compiling for ROCm python tools/amd_build/build_amd.py ``` Install PyTorch +<<<<<<< HEAD ```bash export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}" python -m pip install --no-build-isolation -v -e . 
+======= +```bash +export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}" +python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` **On macOS** ```bash +<<<<<<< HEAD python -m pip install --no-build-isolation -v -e . +======= +python3 setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` **On Windows** @@ -311,7 +352,11 @@ If you want to build legacy python code, please refer to [Building on legacy cod In this mode PyTorch computations will run on your CPU, not your GPU. ```cmd +<<<<<<< HEAD python -m pip install --no-build-isolation -v -e . +======= +python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` Note on OpenMP: The desired OpenMP implementation is Intel OpenMP (iomp). In order to link against iomp, you'll need to manually download the library and set up the building environment by tweaking `CMAKE_INCLUDE_PATH` and `LIB`. The instruction [here](https://github.com/pytorch/pytorch/blob/main/docs/source/notes/windows.rst#building-from-source) is an example for setting up both MKL and Intel OpenMP. Without these configurations for CMake, Microsoft Visual C OpenMP runtime (vcomp) will be used. @@ -332,6 +377,10 @@ Additional libraries such as You can refer to the [build_pytorch.bat](https://github.com/pytorch/pytorch/blob/main/.ci/pytorch/win-test-helpers/build_pytorch.bat) script for some other environment variables configurations +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ```cmd cmd @@ -351,7 +400,12 @@ for /f "usebackq tokens=*" %i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\ :: [Optional] If you want to override the CUDA host compiler set CUDAHOSTCXX=C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\HostX64\x64\cl.exe +<<<<<<< HEAD python -m pip install --no-build-isolation -v -e . +======= +python setup.py develop + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` **Intel GPU builds** @@ -373,7 +427,11 @@ if defined CMAKE_PREFIX_PATH ( set "CMAKE_PREFIX_PATH=%CONDA_PREFIX%\Library" ) +<<<<<<< HEAD python -m pip install --no-build-isolation -v -e . +======= +python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` ##### Adjust Build Options (Optional) @@ -383,7 +441,10 @@ the following. For example, adjusting the pre-detected directories for CuDNN or with such a step. 
On Linux +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ```bash export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}" CMAKE_ONLY=1 python setup.py build @@ -391,10 +452,16 @@ ccmake build # or cmake-gui build ``` On macOS +<<<<<<< HEAD ```bash export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}" MACOSX_DEPLOYMENT_TARGET=11.0 CMAKE_ONLY=1 python setup.py build +======= +```bash +export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}" +MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ CMAKE_ONLY=1 python setup.py build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ccmake build # or cmake-gui build ``` @@ -517,7 +584,11 @@ on [our website](https://pytorch.org/get-started/previous-versions). ## Getting Started +<<<<<<< HEAD Three pointers to get you started: +======= +Three-pointers to get you started: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - [Tutorials: get you started with understanding and using PyTorch](https://pytorch.org/tutorials/) - [Examples: easy to understand PyTorch code across all domains](https://github.com/pytorch/examples) - [The API Reference](https://pytorch.org/docs/) @@ -559,7 +630,11 @@ To learn more about making a contribution to Pytorch, please see our [Contributi PyTorch is a community-driven project with several skillful engineers and researchers contributing to it. +<<<<<<< HEAD PyTorch is currently maintained by [Soumith Chintala](http://soumith.ch), [Gregory Chanan](https://github.com/gchanan), [Dmytro Dzhulgakov](https://github.com/dzhulgakov), [Edward Yang](https://github.com/ezyang), [Alban Desmaison](https://github.com/albanD), [Piotr Bialecki](https://github.com/ptrblck) and [Nikita Shulga](https://github.com/malfet) with major contributions coming from hundreds of talented individuals in various forms and means. +======= +PyTorch is currently maintained by [Soumith Chintala](http://soumith.ch), [Gregory Chanan](https://github.com/gchanan), [Dmytro Dzhulgakov](https://github.com/dzhulgakov), [Edward Yang](https://github.com/ezyang), and [Nikita Shulga](https://github.com/malfet) with major contributions coming from hundreds of talented individuals in various forms and means. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) A non-exhaustive but growing list needs to mention: [Trevor Killeen](https://github.com/killeent), [Sasank Chilamkurthy](https://github.com/chsasank), [Sergey Zagoruyko](https://github.com/szagoruyko), [Adam Lerer](https://github.com/adamlerer), [Francisco Massa](https://github.com/fmassa), [Alykhan Tejani](https://github.com/alykhantejani), [Luca Antiga](https://github.com/lantiga), [Alban Desmaison](https://github.com/albanD), [Andreas Koepf](https://github.com/andreaskoepf), [James Bradbury](https://github.com/jekbradbury), [Zeming Lin](https://github.com/ebetica), [Yuandong Tian](https://github.com/yuandong-tian), [Guillaume Lample](https://github.com/glample), [Marat Dukhan](https://github.com/Maratyszcza), [Natalia Gimelshein](https://github.com/ngimel), [Christian Sarofeen](https://github.com/csarofeen), [Martin Raison](https://github.com/martinraison), [Edward Yang](https://github.com/ezyang), [Zachary Devito](https://github.com/zdevito). Note: This project is unrelated to [hughperkins/pytorch](https://github.com/hughperkins/pytorch) with the same name. Hugh is a valuable contributor to the Torch community and has helped with many things Torch and PyTorch. diff --git a/RELEASE.md b/RELEASE.md index 87f042d659fdf..553896d87d9d5 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -3,7 +3,10 @@ - [Release Compatibility Matrix](#release-compatibility-matrix) +<<<<<<< HEAD - [PyTorch CUDA Support Matrix](#pytorch-cuda-support-matrix) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - [Release Cadence](#release-cadence) - [General Overview](#general-overview) - [Frequently Asked Questions](#frequently-asked-questions) @@ -51,8 +54,11 @@ Following is the Release Compatibility Matrix for PyTorch releases: | PyTorch version | Python | C++ | Stable CUDA | Experimental CUDA | Stable ROCm | | --- | --- | --- | --- | --- | --- | +<<<<<<< HEAD | 2.9 | >=3.10, <=(3.14, 3.14t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 13.0 (CUDNN 9.13.0.50) | ROCm 6.4 | | 2.8 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 12.9 (CUDNN 9.10.2.21) | ROCm 6.4 | +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) | 2.7 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8 (CUDNN 9.1.0.70), CUDA 12.6 (CUDNN 9.5.1.17) | CUDA 12.8 (CUDNN 9.7.1.26) | ROCm 6.3 | | 2.6 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8, CUDA 12.4 (CUDNN 9.1.0.70) | CUDA 12.6 (CUDNN 9.5.1.17) | ROCm 6.2.4 | | 2.5 | >=3.9, <=3.12, (3.13 experimental) | C++17 | CUDA 11.8, CUDA 12.1, CUDA 12.4, CUDNN 9.1.0.70 | None | ROCm 6.2 | @@ -64,6 +70,7 @@ Following is the Release Compatibility Matrix for PyTorch releases: | 1.13 | >=3.7, <=3.10 | C++14 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 | ROCm 5.2 | | 1.12 | >=3.7, <=3.10 | C++14 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 | ROCm 5.0 | +<<<<<<< HEAD ### PyTorch CUDA Support Matrix For Release 2.9 PyTorch Supports following CUDA Architectures: @@ -80,6 +87,8 @@ For Release 2.9 PyTorch Supports following CUDA Architectures: | 12.8.1 | Ampere(8.0), Hopper(9.0), Blackwell(10.0, 12.0) | | 13.0.0 | Ampere(8.0), Hopper(9.0), Blackwell(10.0, 11.0, 12.0+PTX) | +======= +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ## Release Cadence Following is the release cadence. All future dates below are tentative. For latest updates on the release schedule, please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27). Please note: Patch Releases are optional. @@ -92,9 +101,15 @@ Following is the release cadence. All future dates below are tentative. For late | 2.4 | Jun 2024 | Jul 2024 | Sept 2024 | Not planned | | 2.5 | Sep 2024 | Oct 2024 | Nov 2024 | Not planned | | 2.6 | Dec 2024 | Jan 2025 | Not planned | Not planned | +<<<<<<< HEAD | 2.7 | Mar 2025 | Apr 2025 | Jun 2025 | Not planned | | 2.8 | Jun 2025 | Jul 2025 | (Aug 2025) | (Sep 2025) | | 2.9 | Sept 2025 | Oct 2025 | (Nov 2025) | (Dec 2025) | +======= +| 2.7 | Mar 2025 | Apr 2025 | (May 2025) | (Jun 2025) | +| 2.8 | Jun 2025 | Jul 2025 | (Aug 2025) | (Sep 2025) | +| 2.9 | Aug 2025 | Oct 2025 | (Nov 2025) | (Dec 2025) | +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) | 2.10 | Dec 2025 | Jan 2026 | (Feb 2026) | (Mar 2026) | | 2.11 | Mar 2026 | Apr 2026 | (Jun 2026) | (Jul 2026) | diff --git a/SECURITY.md b/SECURITY.md index ed8228af36724..b381b749d58fc 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -16,8 +16,11 @@ However, if you believe you have found a security vulnerability in PyTorch, we e Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new +<<<<<<< HEAD All reports submitted thru the security advisories mechanism would **either be made public or dismissed by the team within 90 days of the submission**. If advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create an [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported: https://www.facebook.com/whitehat @@ -31,9 +34,15 @@ Be careful when running untrusted models. This classification includes models cr **Prefer to execute untrusted models within a secure, isolated environment such as a sandbox** (e.g., containers, virtual machines). This helps protect your system from potentially malicious code. You can find further details and instructions in [this page](https://developers.google.com/code-sandboxing). +<<<<<<< HEAD **Be mindful of risky model formats**. Give preference to share and load weights with the appropriate format for your use case. [safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) has a significantly larger surface of attack but is more flexible in what it can serialize. See the documentation for more details. Even for more secure serialization formats, unexpected inputs to the downstream system can cause diverse security threats (e.g. denial of service, out of bound reads/writes) and thus we recommend extensive validation of any untrusted inputs. +======= +**Be mindful of risky model formats**. 
Give preference to share and load weights with the appropriate format for your use case. [safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) with `weights_only=True` is also secure to our knowledge even though it offers significantly larger surface of attack. Loading un-trusted checkpoint with `weights_only=False` MUST never be done. + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Important Note: The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance. diff --git a/android/README.md b/android/README.md index f0c74750522de..102a795fed980 100644 --- a/android/README.md +++ b/android/README.md @@ -2,7 +2,11 @@ ## Demo applications and tutorials +<<<<<<< HEAD Please refer to [meta-pytorch/executorch-examples](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3/android/DeepLabV3Demo) for the Android demo app based on [ExecuTorch](https://github.com/pytorch/executorch). +======= +Please refer to [pytorch-labs/executorch-examples](https://github.com/pytorch-labs/executorch-examples/tree/main/dl3/android/DeepLabV3Demo) for the Android demo app based on [ExecuTorch](https://github.com/pytorch/executorch). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Please join our [Discord](https://discord.com/channels/1334270993966825602/1349854760299270284) for any questions. diff --git a/aten/src/ATen/BlasBackend.h b/aten/src/ATen/BlasBackend.h index 03b00cc215640..b7394a370175a 100644 --- a/aten/src/ATen/BlasBackend.h +++ b/aten/src/ATen/BlasBackend.h @@ -28,6 +28,7 @@ inline std::ostream& operator<<(std::ostream& stream, at::BlasBackend backend) { return stream << BlasBackendToString(backend); } +<<<<<<< HEAD namespace blas { enum class ScalingType : std::uint8_t { @@ -43,4 +44,6 @@ enum class SwizzleType : std::uint8_t { NO_SWIZZLE = 0, SWIZZLE_32_4_4 = 1 }; } // namespace blas +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index b9ccee7db811f..520de1082b974 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -38,7 +38,11 @@ set_bool(AT_HIPSPARSELT_ENABLED CAFFE2_USE_HIPSPARSELT) configure_file(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h") # TODO: Do not generate CUDAConfig.h for ROCm BUILDS +<<<<<<< HEAD # At the moment, `jit_macros.h` include CUDAConfig.h for both CUDA and HIP builds +======= +# At the moment, `jit_macors.h` include CUDAConfig.h for both CUDA and HIP builds +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if(USE_CUDA OR USE_ROCM) configure_file(cuda/CUDAConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/cuda/CUDAConfig.h") endif() @@ -96,8 +100,11 @@ file(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp") file(GLOB vulkan_cpp "vulkan/*.cpp") file(GLOB native_vulkan_cpp "native/vulkan/*.cpp" "native/vulkan/api/*.cpp" "native/vulkan/impl/*.cpp" "native/vulkan/ops/*.cpp") +<<<<<<< HEAD file(GLOB native_eigen_cpp "native/sparse/eigen/*.cpp") +======= 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Metal file(GLOB metal_h "metal/*.h") file(GLOB metal_cpp "metal/*.cpp") @@ -121,8 +128,11 @@ file(GLOB_RECURSE native_mps_cpp "native/mps/*.cpp") file(GLOB_RECURSE native_mps_mm "native/mps/*.mm") file(GLOB_RECURSE native_mps_metal "native/mps/*.metal") file(GLOB_RECURSE native_mps_h "native/mps/*.h") +<<<<<<< HEAD file(GLOB_RECURSE native_sparse_mps_mm "native/sparse/mps/*.mm") file(GLOB_RECURSE native_mps_sparse_metal "native/sparse/mps/*.metal") +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) file(GLOB native_sparse_cpp "native/sparse/*.cpp") file(GLOB native_quantized_cpp @@ -182,6 +192,7 @@ file(GLOB native_flash_attn_api_cpp "native/transformers/cuda/flash_attn/flash_a file(GLOB flash_attention_hip_hip "native/transformers/hip/flash_attn/*.hip") # if USE_FLASH_ATTENTION is set, ensure CK instances get generated if(USE_FLASH_ATTENTION) +<<<<<<< HEAD if("$ENV{USE_CK_FLASH_ATTENTION}" STREQUAL "1") message(STATUS "USE_CK_FLASH_ATTENTION is being deprecated. Please use USE_ROCM_CK_SDPA instead") caffe2_update_option(USE_ROCM_CK_SDPA ON) @@ -203,6 +214,24 @@ if(USE_FLASH_ATTENTION) add_subdirectory(native/transformers/hip/flash_attn/ck/fav_v3) file(GLOB flash_attention_v3_hip "native/transformers/hip/flash_attn/ck/fav_v3/*.hip") list(APPEND native_transformers_hip_hip ${flash_attention_v3_hip}) +======= + if(DEFINED ENV{USE_CK_FLASH_ATTENTION}) + set(USE_CK_FLASH_ATTENTION $ENV{USE_CK_FLASH_ATTENTION}) + if(USE_CK_FLASH_ATTENTION STREQUAL "1") + if(DEFINED ENV{PYTORCH_ROCM_ARCH}) + list(LENGTH PYTORCH_ROCM_ARCH NUM_ARCHS) + if(NUM_ARCHS GREATER 1) + message(WARNING "Building CK for multiple archs can increase build time considerably! 
+ Consider setting PYTORCH_ROCM_ARCH env var value as the gfx arch you need to build for") + endif() + endif() + message(STATUS "USE_CK_FLASH_ATTENTION is set; building PyTorch with CK Flash Attention enabled") + message(STATUS "Generating CK kernel instances...") + add_subdirectory(native/transformers/hip/flash_attn/ck) + file(GLOB flash_attention_hip_ck_hip "native/transformers/hip/flash_attn/ck/*.hip") + list(APPEND native_transformers_hip_hip ${flash_attention_hip_ck_hip}) + endif() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif() file(GLOB flash_attention_hip_aot_hip "native/transformers/hip/flash_attn/aot/*.hip") file(GLOB flash_attention_src_hip_hip "native/transformers/hip/flash_attn/src/*.hip") @@ -216,7 +245,11 @@ file(GLOB mem_eff_attention_cuda_cpp "native/transformers/cuda/mem_eff_attention if(USE_CUDA AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)) add_library(flash_attention OBJECT EXCLUDE_FROM_ALL ${flash_attention_cuda_kernels_cu} ${flash_attention_cuda_cpp}) +<<<<<<< HEAD target_include_directories(flash_attention SYSTEM PUBLIC +======= + target_include_directories(flash_attention PUBLIC +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc ${PROJECT_SOURCE_DIR}/third_party/flash-attention/include ${PROJECT_SOURCE_DIR}/third_party/cutlass/include @@ -252,6 +285,7 @@ if(USE_MEM_EFF_ATTENTION) list(APPEND ATen_ATTENTION_KERNEL_SRCS ${mem_eff_attention_cuda_kernels_cu}) endif() +<<<<<<< HEAD IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH) message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF") set(USE_FBGEMM_GENAI off) @@ -363,6 +397,8 @@ IF(USE_FBGEMM_GENAI) endif() endif() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # XNNPACK file(GLOB native_xnnpack "native/xnnpack/*.cpp") @@ -410,9 +446,12 @@ if(USE_VULKAN) else() set(all_cpu_cpp ${all_cpu_cpp} ${vulkan_cpp}) endif() +<<<<<<< HEAD if(USE_EIGEN_SPARSE) set(all_cpu_cpp ${all_cpu_cpp} ${native_eigen_cpp}) endif() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if(USE_MTIA) set(ATen_MTIA_SRCS ${ATen_MTIA_SRCS} ${mtia_cpp} ${mtia_h} ${native_mtia_cpp} ${native_mtia_h}) @@ -491,6 +530,7 @@ if(USE_CUDA) endif() if(USE_ROCM) +<<<<<<< HEAD if((USE_FLASH_ATTENTION AND USE_ROCM_CK_SDPA) OR USE_ROCM_CK_GEMM) # NOTE: The PyTorch build does not actually add_subdirectory # third_party/composable_kernel or use it as a CMake library. What is used @@ -520,13 +560,46 @@ if(USE_ROCM) list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/aiter/csrc/include) _pytorch_rocm_generate_ck_conf() endif() +======= + # NOTE: The PyTorch build does not actually add_subdirectory + # third_party/composable_kernel or use it as a CMake library. What is used + # is header only, so this should be ok, except that the CMake build generates + # a ck/config.h. We just do that part here. Without this, the ck.h from the + # ROCM SDK may get accidentally used instead. 
+ function(_pytorch_rocm_generate_ck_conf) + set(CK_ENABLE_INT8 "ON") + set(CK_ENABLE_FP16 "ON") + set(CK_ENABLE_FP32 "ON") + set(CK_ENABLE_FP64 "ON") + set(CK_ENABLE_BF16 "ON") + set(CK_ENABLE_FP8 "ON") + set(CK_ENABLE_BF8 "ON") + set(CK_USE_XDL "ON") + set(CK_USE_WMMA "ON") + configure_file( + "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in" + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h" + ) + endfunction() + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel) + _pytorch_rocm_generate_ck_conf() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Next two lines are needed because TunableOp uses third-party/fmt list(APPEND ATen_HIP_INCLUDE $) list(APPEND ATen_HIP_DEPENDENCY_LIBS fmt::fmt-header-only) +<<<<<<< HEAD if(USE_FLASH_ATTENTION AND USE_ROCM_CK_SDPA) list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/transformers/hip/flash_attn/ck) endif() +======= +if(USE_FLASH_ATTENTION) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/transformers/hip/flash_attn/ck) +endif() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) list(APPEND ATen_HIP_SRCS ${ATen_HIP_SRCS} ${hip_hip} @@ -536,13 +609,20 @@ if(USE_ROCM) ${native_quantized_hip_hip} ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} ) +<<<<<<< HEAD if(NOT USE_ROCM_CK_GEMM) +======= + if(WIN32) # Windows doesn't support Composable Kernels +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") file(GLOB native_hip_ck "native/hip/ck*.hip") exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" ${native_hip_bgemm} ${native_hip_ck}) endif() +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) list(APPEND all_hip_cpp ${native_nested_hip_cpp} @@ -581,7 +661,11 @@ if(LAPACK_FOUND) # would not need this at all), some of our libraries (magma in particular) # backend to CPU BLAS/LAPACK implementations, and so it is very important # we get the *right* implementation, because even if the symbols are the +<<<<<<< HEAD # same, LAPACK implementations may have different calling conventions. +======= + # same, LAPACK implementions may have different calling conventions. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # This caused https://github.com/pytorch/pytorch/issues/7353 # # We do NOT do this on Linux, since we just rely on torch_cpu to @@ -621,11 +705,14 @@ if(UNIX) if(HAVE_MALLOC_USABLE_SIZE) add_definitions(-DHAVE_MALLOC_USABLE_SIZE=1) endif(HAVE_MALLOC_USABLE_SIZE) +<<<<<<< HEAD set(CMAKE_EXTRA_INCLUDE_FILES "fcntl.h") CHECK_FUNCTION_EXISTS(posix_fallocate HAVE_POSIX_FALLOCATE) if(HAVE_POSIX_FALLOCATE) add_definitions(-DHAVE_POSIX_FALLOCATE=1) endif(HAVE_POSIX_FALLOCATE) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif(UNIX) ADD_DEFINITIONS(-DUSE_EXTERNAL_MZCRC) @@ -707,6 +794,7 @@ if(USE_CUDA AND NOT USE_ROCM) add_definitions(-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include) +<<<<<<< HEAD if($ENV{ATEN_STATIC_CUDA}) if(CUDA_VERSION VERSION_LESS_EQUAL 12.9) @@ -726,6 +814,26 @@ if(USE_CUDA AND NOT USE_ROCM) CUDA::cusolver_static ${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a # needed for libcusolver_static ) +======= + if($ENV{ATEN_STATIC_CUDA}) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + ${CUDA_LIBRARIES} + CUDA::cusparse_static + CUDA::cufft_static_nocallback + ) + if(NOT BUILD_LAZY_CUDA_LINALG) + if(CUDA_VERSION_MAJOR LESS_EQUAL 11) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + CUDA::cusolver_static + ${CUDAToolkit_LIBRARY_DIR}/liblapack_static.a # needed for libcusolver_static + ) + elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 12) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + CUDA::cusolver_static + ${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a # needed for libcusolver_static + ) + endif() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif() else() list(APPEND ATen_CUDA_DEPENDENCY_LIBS @@ -790,6 +898,7 @@ endif() if(USE_MPS) include(../../../cmake/Metal.cmake) +<<<<<<< HEAD set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h} ${native_sparse_mps_mm}) if(CAN_COMPILE_METAL) @@ -809,6 +918,31 @@ if(USE_MPS) else() file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/native/mps") foreach(SHADER ${native_mps_metal} ${native_mps_sparse_metal}) +======= + set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h}) + + if(CAN_COMPILE_METAL) + foreach(SHADER ${native_mps_metal}) + cmake_path(GET SHADER STEM TGT_STEM) + string(CONCAT TGT_BASIC ${TGT_STEM} "_30.air") + string(CONCAT TGT_BFLOAT ${TGT_STEM} "_31.air") + list(APPEND AIR_BASIC ${TGT_BASIC}) + list(APPEND AIR_BFLOAT ${TGT_BFLOAT}) + metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.0") + metal_to_air(${SHADER} ${TGT_BFLOAT} "-std=metal3.1") + endforeach() + air_to_metallib(kernels_basic.metallib ${AIR_BASIC}) + air_to_metallib(kernels_bfloat.metallib ${AIR_BFLOAT}) + add_custom_command( + COMMAND echo "// $$(date)" > metallib_dummy.cpp + DEPENDS kernels_basic.metallib kernels_bfloat.metallib + OUTPUT metallib_dummy.cpp + COMMENT "Updating metallibs timestamp") + add_custom_target(metallibs DEPENDS kernels_basic.metallib kernels_bfloat.metallib metallib_dummy.cpp) + else() + file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/native/mps") + 
foreach(SHADER ${native_mps_metal}) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cmake_path(GET SHADER STEM TGT_STEM) string(CONCAT SHADER_HDR_NAME "${CMAKE_CURRENT_BINARY_DIR}" /native/mps/ ${TGT_STEM} "_metallib.h") metal_to_metallib_h(${SHADER} ${SHADER_HDR_NAME}) diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index 8b283556c7a43..4e3b090f7bfdd 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -144,7 +144,12 @@ inline std::string _all_equal_numel_error(at::ArrayRef tensors) { inline bool _apply_preamble(ArrayRef tensors) { checkDeviceType("CPU_tensor_apply", tensors, kCPU); checkLayout("CPU_tensor_apply", tensors, kStrided); +<<<<<<< HEAD TORCH_CHECK(_all_equal_numel(tensors), _all_equal_numel_error(tensors)); +======= + if (!_all_equal_numel(tensors)) + TORCH_CHECK(false, _all_equal_numel_error(tensors)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // An empty tensor has no elements for (auto& t : tensors) if (t.numel() == 0) diff --git a/aten/src/ATen/CPUGeneratorImpl.cpp b/aten/src/ATen/CPUGeneratorImpl.cpp index 44ad24b81755f..f27391e7ee73c 100644 --- a/aten/src/ATen/CPUGeneratorImpl.cpp +++ b/aten/src/ATen/CPUGeneratorImpl.cpp @@ -131,18 +131,36 @@ uint64_t CPUGeneratorImpl::seed() { /** * Sets the internal state of CPUGeneratorImpl. The new internal state +<<<<<<< HEAD * must be a strided CPU byte tensor and of the same size as CPUGeneratorImplState. +======= + * must be a strided CPU byte tensor and of the same size as either + * CPUGeneratorImplStateLegacy (for legacy CPU generator state) or + * CPUGeneratorImplState (for new state). + * + * FIXME: Remove support of the legacy state in the future? +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) */ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { using detail::CPUGeneratorImplState; using detail::CPUGeneratorImplStateLegacy; +<<<<<<< HEAD static_assert(std::is_standard_layout_v, "CPUGeneratorImplState is not a PODType"); constexpr size_t size = sizeof(CPUGeneratorImplState); +======= + static_assert(std::is_standard_layout_v, "CPUGeneratorImplStateLegacy is not a PODType"); + static_assert(std::is_standard_layout_v, "CPUGeneratorImplState is not a PODType"); + + static const size_t size_legacy = sizeof(CPUGeneratorImplStateLegacy); + static const size_t size_current = sizeof(CPUGeneratorImplState); + static_assert(size_legacy != size_current, "CPUGeneratorImplStateLegacy and CPUGeneratorImplState can't be of the same size"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) detail::check_rng_state(new_state); at::mt19937 engine; +<<<<<<< HEAD auto new_state_size = new_state.numel(); TORCH_CHECK(new_state_size == size, "Expected a CPUGeneratorImplState of size ", size, @@ -150,6 +168,51 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { auto rng_state = new_state.data_ptr_impl(); auto legacy_pod = &(rng_state->legacy_pod); +======= + auto float_normal_sample = std::optional(); + auto double_normal_sample = std::optional(); + + // Construct the state of at::CPUGeneratorImpl based on input byte tensor size. 
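The legacy branch that follows rebuilds the cached normal sample from the legacy fields `normal_x` and `normal_rho`; a minimal Python sketch of that conversion, mirroring the `r * sin(theta)` expression in the hunk (illustrative, not part of the patch):

```python
import math

def cached_normal_from_legacy(normal_x: float, normal_rho: float) -> float:
    # The legacy state keeps the uniform draw (normal_x) and the radius
    # (normal_rho); the new state caches the second Box-Muller sample
    # directly, i.e. the sin() branch of the transform.
    theta = 2.0 * math.pi * normal_x
    return normal_rho * math.sin(theta)
```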
+ CPUGeneratorImplStateLegacy* legacy_pod{nullptr}; + auto new_state_size = new_state.numel(); + if (new_state_size == size_legacy) { + legacy_pod = (CPUGeneratorImplStateLegacy*)new_state.data(); + // Note that in CPUGeneratorImplStateLegacy, we didn't have float version + // of normal sample and hence we leave the std::optional as is + + // Update next_double_normal_sample. + // Note that CPUGeneratorImplStateLegacy stores two uniform values (normal_x, normal_y) + // and a rho value (normal_rho). These three values were redundant and in the new + // DistributionsHelper.h, we store the actual extra normal sample, rather than three + // intermediate values. + if (legacy_pod->normal_is_valid) { + auto r = legacy_pod->normal_rho; + auto theta = 2.0 * c10::pi * legacy_pod->normal_x; + // we return the sin version of the normal sample when in caching mode + double_normal_sample = std::optional(r * ::sin(theta)); + } + } else if (new_state_size == size_current) { + auto rng_state = (CPUGeneratorImplState*)new_state.data(); + legacy_pod = &rng_state->legacy_pod; + // update next_float_normal_sample + if (rng_state->is_next_float_normal_sample_valid) { + float_normal_sample = std::optional(rng_state->next_float_normal_sample); + } + + // Update next_double_normal_sample. + // Note that in getRNGState, we now return the actual normal sample in normal_y + // and if it's valid in normal_is_valid. The redundant normal_x and normal_rho + // are squashed to 0.0. + if (legacy_pod->normal_is_valid) { + double_normal_sample = std::optional(legacy_pod->normal_y); + } + } else { + TORCH_CHECK(false, "Expected either a CPUGeneratorImplStateLegacy of size ", size_legacy, + " or a CPUGeneratorImplState of size ", size_current, + " but found the input RNG state size to be ", new_state_size); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // construct engine_ // Note that CPUGeneratorImplStateLegacy stored a state array of 64 bit uints, whereas in our // redefined mt19937, we have changed to a state array of 32 bit uints. Hence, we are @@ -163,12 +226,17 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { engine.set_data(rng_data); TORCH_CHECK(engine.is_valid(), "Invalid mt19937 state"); this->engine_ = engine; +<<<<<<< HEAD this->next_float_normal_sample_ = rng_state->is_next_float_normal_sample_valid ? std::optional(rng_state->next_float_normal_sample) : std::optional(); this->next_double_normal_sample_ = legacy_pod->normal_is_valid ? 
std::optional(legacy_pod->normal_y) : std::optional(); +======= + this->next_float_normal_sample_ = float_normal_sample; + this->next_double_normal_sample_ = double_normal_sample; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } /** diff --git a/aten/src/ATen/Config.h.in b/aten/src/ATen/Config.h.in index 0bae6d4af6e5e..c4475dc390fce 100644 --- a/aten/src/ATen/Config.h.in +++ b/aten/src/ATen/Config.h.in @@ -20,4 +20,7 @@ #define AT_BLAS_F2C() @AT_BLAS_F2C@ #define AT_BLAS_USE_CBLAS_DOT() @AT_BLAS_USE_CBLAS_DOT@ #define AT_KLEIDIAI_ENABLED() @AT_KLEIDIAI_ENABLED@ +<<<<<<< HEAD #define AT_USE_EIGEN_SPARSE() @AT_USE_EIGEN_SPARSE@ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 3310abfb41d54..e91a2abb8a710 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -14,13 +14,18 @@ #include #ifdef USE_FBGEMM +<<<<<<< HEAD C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi") #include C10_DIAGNOSTIC_POP() +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // USE_FBGEMM #if defined(__aarch64__) && !defined(C10_MOBILE) #include #endif +<<<<<<< HEAD namespace at { namespace { @@ -99,6 +104,11 @@ std::string precision2str(Float32Precision prec) { TORCH_CHECK(false, "Invalid enum Float32Precision(", static_cast(prec), ")"); } +======= + +namespace at { + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Context::Context() = default; // TODO: This could be bad juju if someone calls globalContext() in the @@ -192,6 +202,7 @@ void Context::setUserEnabledNNPACK(bool e) { enabled_nnpack = e; } +<<<<<<< HEAD bool Context::allowTF32CuDNN(std::optional op) const { if (!op.has_value()) { bool allow_tf32_rnn = float32Precision(Float32Backend::CUDA, Float32Op::RNN) == Float32Precision::TF32; @@ -207,14 +218,21 @@ bool Context::allowTF32CuDNN(std::optional op) const { return float32Precision(Float32Backend::CUDA, op.value()) == Float32Precision::TF32; } warn_deprecated_fp32_precision_api(); +======= +bool Context::allowTF32CuDNN() const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return allow_tf32_cudnn; } void Context::setAllowTF32CuDNN(bool b) { +<<<<<<< HEAD setFloat32Precision(Float32Backend::CUDA, Float32Op::RNN, b ? Float32Precision::TF32 : Float32Precision::NONE); setFloat32Precision(Float32Backend::CUDA, Float32Op::CONV, b ? Float32Precision::TF32 : Float32Precision::NONE); allow_tf32_cudnn = b; warn_deprecated_fp32_precision_api(); +======= + allow_tf32_cudnn = b; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void Context::setSDPPriorityOrder(const std::vector& order) { @@ -235,6 +253,7 @@ bool Context::allowTF32OneDNN() const { return allow_tf32_onednn; } +<<<<<<< HEAD // NOLINTNEXTLINE(clang-diagnostic-unused-parameter) void Context::setAllowTF32OneDNN(bool b){ #ifdef USE_XPU @@ -242,6 +261,14 @@ bool Context::allowTF32OneDNN() const { #else TORCH_WARN("TF32 acceleration on top of oneDNN is available for Intel GPUs. 
The current Torch version does not have Intel GPU Support."); #endif +======= +void Context::setAllowTF32OneDNN(bool b){ +#ifdef USE_XPU + allow_tf32_onednn = b; +#else + TORCH_WARN("TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support."); +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } bool Context::userEnabledFlashSDP() const { @@ -292,6 +319,48 @@ bool Context::userEnabledOverrideableSDP() const { return enabled_overrideable; } +<<<<<<< HEAD +======= +static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG"; +static constexpr const std::array cublas_deterministic_configs = {":4096:8", ":16:8"}; +#ifdef USE_ROCM +static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32"; +#endif + +bool Context::checkCuBLASConfigDeterministic() { + // If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config + // is set to deterministic setting + if (hasCUDART()) { + const auto workspace_config = c10::utils::get_env(cublas_config_var_name); + return (workspace_config == cublas_deterministic_configs[0] || workspace_config == cublas_deterministic_configs[1]); + } + return true; +} + +void Context::alertCuBLASConfigNotDeterministic() const { + static const bool cublas_config_deterministic = checkCuBLASConfigDeterministic(); + if (C10_LIKELY(!deterministicAlgorithms() || cublas_config_deterministic)) { + return; + } + + auto msg = c10::str( + "Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or ", + "`at::Context::setDeterministicAlgorithms(true)`, but this operation is not deterministic because ", + "it uses CuBLAS and you have CUDA >= 10.2. To enable deterministic behavior in this ", + "case, you must set an environment variable before running your PyTorch application: ", + cublas_config_var_name, "=", cublas_deterministic_configs[0], " or ", + cublas_config_var_name, "=", cublas_deterministic_configs[1], ". For more information, go to ", + "https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility" + ); + + if (deterministicAlgorithmsWarnOnly()) { + TORCH_WARN(msg); + } else { + TORCH_CHECK(false, msg); + } +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool Context::benchmarkCuDNN() const { return benchmark_cudnn; } @@ -308,6 +377,7 @@ void Context::setBenchmarkLimitCuDNN(int b) { benchmark_limit_cudnn = b; } +<<<<<<< HEAD bool Context::immediateMiopen() const { return immediate_miopen; } @@ -373,10 +443,41 @@ Float32Precision Context::float32Precision(Float32Backend backend, Float32Op op) return Float32Precision::NONE; } return precision; +======= +bool Context::allowTF32CuBLAS() const { +#ifdef USE_ROCM + const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32); + if (allow_tf32 != true) { + return false; + } +#endif + return float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST; +} + +void Context::setAllowTF32CuBLAS(bool b) { +#ifdef USE_ROCM + const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32); + if (allow_tf32 != true) { + C10_LOG_FIRST_N(INFO, 10) << "torch.backends.cuda.matmul.allow_tf32 is not supported on ROCm by default. " + << "Please set environment variable HIPBLASLT_ALLOW_TF32=1 to enable it."; + return; + } +#endif + float32_matmul_precision = b ? 
at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST; +} + +Float32MatmulPrecision Context::float32MatmulPrecision() const { + return float32_matmul_precision; +} + +void Context::setFloat32MatmulPrecision(Float32MatmulPrecision p) { + float32_matmul_precision = p; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void Context::setFloat32MatmulPrecision(const std::string &s) { auto match = [this](const std::string & s_) { +<<<<<<< HEAD warn_deprecated_fp32_precision_api(); // TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention if (s_ == "highest") { @@ -393,6 +494,17 @@ void Context::setFloat32MatmulPrecision(const std::string &s) { float32_matmul_precision = at::Float32MatmulPrecision::MEDIUM; setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::TF32); setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::BF16); +======= + // TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention + if (s_ == "highest") { + float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST; + return true; + } else if (s_ == "high") { + float32_matmul_precision = at::Float32MatmulPrecision::HIGH; + return true; + } else if (s_ == "medium") { + float32_matmul_precision = at::Float32MatmulPrecision::MEDIUM; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } return false; @@ -406,6 +518,7 @@ void Context::setFloat32MatmulPrecision(const std::string &s) { "setFloat32MatmulPrecision call has no effect."); } +<<<<<<< HEAD void Context::setFloat32Precision(Float32Backend backend, Float32Op op, Float32Precision p) { auto it = fp32_precision.find(std::make_pair(backend, op)); TORCH_CHECK( @@ -418,6 +531,8 @@ void Context::setFloat32Precision(Float32Backend backend, Float32Op op, Float32P it->second = p; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::LinalgBackend Context::linalgPreferredBackend() const { return linalg_preferred_backend; } @@ -442,9 +557,12 @@ at::BlasBackend Context::blasPreferredBackend() { // call site for blasPreferredBackend(), we set it to an actual value. 
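The `setFloat32MatmulPrecision(const std::string&)` matcher above is what the user-facing precision knob feeds into; a short usage sketch with the same three accepted values (illustrative, not part of the patch):

```python
import torch

# "highest" keeps full-FP32 matmuls; "high" and "medium" permit
# lower-precision internal math such as TF32 on CUDA, matching the
# string branches matched above.
torch.set_float32_matmul_precision("high")
print(torch.get_float32_matmul_precision())  # -> "high"
```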
if (blas_preferred_backend == at::BlasBackend::Default) { blas_preferred_backend = at::BlasBackend::Cublas; +<<<<<<< HEAD // This logic sits in the getter because it needs to validate // values set via env vars such as TORCH_BLAS_PREFER_CUBLASLT // which initialize the backend without calling the setter +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifdef USE_ROCM // AMD Instinct targets prefer hipblaslt static const bool hipblaslt_preferred = []() { @@ -453,6 +571,12 @@ at::BlasBackend Context::blasPreferredBackend() { #if ROCM_VERSION >= 60400 "gfx1200", "gfx1201", #endif +<<<<<<< HEAD +======= +#if ROCM_VERSION >= 60402 + "gfx1150", "gfx1151", +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if ROCM_VERSION >= 60500 "gfx950" #endif @@ -474,6 +598,7 @@ at::BlasBackend Context::blasPreferredBackend() { // hipblaslt support for all archs is not as complete as hipblas if (blas_preferred_backend == at::BlasBackend::Cublaslt) { static const bool hipblaslt_unsupported = []() { +<<<<<<< HEAD if(!hasCuBLASLt()) { return true; @@ -485,6 +610,18 @@ at::BlasBackend Context::blasPreferredBackend() { #endif #if ROCM_VERSION >= 70000 "gfx950", "gfx1150", "gfx1151" +======= + static const std::vector archs = { + "gfx90a", "gfx942", +#if ROCM_VERSION >= 60300 + "gfx1100", "gfx1101", "gfx1200", "gfx1201", +#endif +#if ROCM_VERSION >= 60402 + "gfx1150", "gfx1151", +#endif +#if ROCM_VERSION >= 60500 + "gfx950" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif }; for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) { @@ -503,6 +640,7 @@ at::BlasBackend Context::blasPreferredBackend() { return blas_preferred_backend; } +<<<<<<< HEAD bool Context::ckSupported() { #ifdef USE_ROCM static const std::vector supported_archs = { @@ -521,6 +659,8 @@ bool Context::ckSupported() { #endif } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void Context::setBlasPreferredBackend(at::BlasBackend b) { #ifdef _MSC_VER TORCH_WARN_ONCE( @@ -530,6 +670,7 @@ void Context::setBlasPreferredBackend(at::BlasBackend b) { #else TORCH_CHECK((b != at::BlasBackend::Cublaslt) || hasCuBLASLt(), "Cannot set preferred backend to cuBLASLt if PyTorch has not been compiled with cuBLASLt."); +<<<<<<< HEAD #ifdef USE_ROCM static const bool ckSupportedFlag = ckSupported(); static const bool hasCKGEMMFlag = hasCKGEMM(); @@ -538,6 +679,10 @@ void Context::setBlasPreferredBackend(at::BlasBackend b) { "architecture supported for CK: ", ckSupportedFlag, ", PyTorch built with CK GEMM support: ", hasCKGEMMFlag); #endif +======= + TORCH_CHECK((b != at::BlasBackend::Ck) || hasROCM(), + "Cannot set preferred backend to Ck if PyTorch has not been compiled for ROCm."); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (b != at::BlasBackend::Default && b != at::BlasBackend::Cublas) { TORCH_WARN_ONCE( "torch.backends.cuda.preferred_blas_library is an experimental feature. " @@ -549,6 +694,7 @@ void Context::setBlasPreferredBackend(at::BlasBackend b) { #endif } +<<<<<<< HEAD at::ROCmFABackend Context::getROCmFAPreferredBackend() { #ifdef USE_ROCM // Set potential "Default" value so we don't have to interpret at call sites. 
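The `setBlasPreferredBackend()` path above is reached through `torch.backends.cuda.preferred_blas_library`, the experimental knob named in its own warning message; a usage sketch (illustrative, not part of the patch; availability of cuBLASLt/hipBLASLt and CK remains gated by the build flags and architecture checks shown in both conflict sides):

```python
import torch

current = torch.backends.cuda.preferred_blas_library()   # query the current choice
torch.backends.cuda.preferred_blas_library("cublaslt")   # opt in to cuBLASLt/hipBLASLt
```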
@@ -572,10 +718,14 @@ at::ROCmFABackend Context::getROCmFAPreferredBackend() { } #endif +======= +at::ROCmFABackend Context::getROCmFAPreferredBackend() const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return rocm_fa_preferred_backend; } void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) { +<<<<<<< HEAD #ifdef USE_ROCM static const bool hasCKSDPAFlag = hasCKSDPA(); static const bool ckSupportedFlag = ckSupported(); @@ -583,10 +733,37 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) { "Cannot set preferred SDPA backend to CK since following conditions are not true: ", "architecture supported for CK: ", ckSupportedFlag, ", PyTorch built with CK SDPA support: ", hasCKSDPAFlag); +======= + + // TODO: add plumbing for hasCK for validity checking + TORCH_CHECK((b != at::ROCmFABackend::Ck) || hasROCM(), + "Cannot set preferred flash attention backend to Ck if PyTorch has not been compiled for ROCm."); +#ifdef USE_ROCM + if(b == at::ROCmFABackend::Ck) { + static const bool ck_unsupported = []() { + static const std::vector archs = { + "gfx90a", "gfx942", "gfx950" + }; + for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) { + if (!detail::getCUDAHooks().isGPUArch(archs, index)) { + TORCH_WARN_ONCE( + "Attempting to use CK on an unsupported architecture! Cannot set backend to CK"); + return true; + } + } + return false; + }(); + if(!ck_unsupported) rocm_fa_preferred_backend = b; + } + else { + rocm_fa_preferred_backend = b; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif rocm_fa_preferred_backend = b; } +<<<<<<< HEAD CuBLASReductionOption Context::allowFP16ReductionCuBLAS() const { return allow_fp16_reduction_cublas; } @@ -614,6 +791,22 @@ CuBLASReductionOption Context::allowBF16ReductionCuBLAS() const { void Context::setAllowBF16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) { allow_bf16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk); +======= +bool Context::allowFP16ReductionCuBLAS() const { + return allow_fp16_reduction_cublas; +} + +void Context::setAllowFP16ReductionCuBLAS(bool b) { + allow_fp16_reduction_cublas = b; +} + +bool Context::allowBF16ReductionCuBLAS() const { + return allow_bf16_reduction_cublas; +} + +void Context::setAllowBF16ReductionCuBLAS(bool b) { + allow_bf16_reduction_cublas = b; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } bool Context::allowFP16AccumulationCuBLAS() const { @@ -673,6 +866,7 @@ bool Context::hasLAPACK() { #endif } +<<<<<<< HEAD bool Context::hasEigenSparse() { #if AT_USE_EIGEN_SPARSE() return true; @@ -681,6 +875,8 @@ bool Context::hasEigenSparse() { #endif } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::QEngine Context::qEngine() const { static auto _quantized_engine = []() { at::QEngine qengine = at::kNoQEngine; @@ -704,14 +900,22 @@ at::QEngine Context::qEngine() const { #endif return qengine; }(); +<<<<<<< HEAD auto qt_engine = quantized_engine.load(); return qt_engine == at::QEngine::NoQEngine ? 
_quantized_engine : qt_engine; +======= + return quantized_engine.value_or(_quantized_engine); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void Context::setQEngine(at::QEngine e) { const auto& qengines = supportedQEngines(); if (std::find(qengines.begin(), qengines.end(), e) != qengines.end()) { +<<<<<<< HEAD quantized_engine.store(e); +======= + quantized_engine = e; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return; } TORCH_CHECK(false, "quantized engine ", toString(e), " is not supported"); @@ -723,9 +927,23 @@ const std::vector& Context::supportedQEngines() { // Engines are listed in priority order: later one wins // By default we prefer FBGEMM if we're running on server side // QNNPACK on server side has some issue, so we disable it by default. +<<<<<<< HEAD +#ifdef USE_PYTORCH_QNNPACK + engines.push_back(at::kQNNPACK); +#endif +======= +#ifdef C10_MOBILE + engines.push_back(at::kNoQEngine); +#ifdef USE_PYTORCH_QNNPACK + engines.push_back(at::kQNNPACK); +#endif +#else // C10_MOBILE #ifdef USE_PYTORCH_QNNPACK engines.push_back(at::kQNNPACK); #endif + engines.push_back(at::kNoQEngine); +#endif // C10_MOBILE +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if AT_MKLDNN_ENABLED() engines.push_back(at::kONEDNN); @@ -857,7 +1075,10 @@ void Context::setAllowFP16ReductionCPU(bool b) { #if defined(__aarch64__) && !defined(C10_MOBILE) if (!cpuinfo_initialize() || !cpuinfo_has_arm_fp16_arith()) #else +<<<<<<< HEAD // NOLINTNEXTLINE(facebook-hte-MissingBraces) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (true) #endif TORCH_CHECK(false, "Float16 arithmetic is not supported by the CPU!"); diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index a4a26b5671e59..5854e827a2572 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -19,13 +19,17 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include #include #include #include +<<<<<<< HEAD #include #include @@ -33,6 +37,12 @@ #include #include #include +======= +#include + +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { @@ -40,6 +50,7 @@ class Tensor; enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM }; +<<<<<<< HEAD enum class CuBLASReductionOption : uint8_t { AllowReducedPrecisionWithSplitK = 0, DisallowReducedPrecisionAllowSplitK = 1, @@ -54,6 +65,8 @@ TORCH_API Float32Op str2op(const std::string& name); TORCH_API Float32Precision str2precision(const std::string& name); TORCH_API std::string precision2str(Float32Precision prec); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class TORCH_API Context { public: Context(); @@ -89,8 +102,11 @@ class TORCH_API Context { return at::detail::getHIPHooks(); } else if (opt_device_type == at::kHPU) { return at::detail::getHPUHooks(); +<<<<<<< HEAD } else if (opt_device_type == at::kXLA) { return at::detail::getXLAHooks(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized 
binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { TORCH_CHECK( false, @@ -151,8 +167,11 @@ class TORCH_API Context { static bool hasKleidiAI(); static bool hasLAPACK(); static bool hasMKLDNN(); +<<<<<<< HEAD static bool ckSupported(); static bool hasEigenSparse(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static bool hasMAGMA() { return detail::getCUDAHooks().hasMAGMA(); } @@ -183,12 +202,15 @@ class TORCH_API Context { static bool hasROCM() { return detail::getCUDAHooks().hasROCM(); } +<<<<<<< HEAD static bool hasCKSDPA() { return detail::getCUDAHooks().hasCKSDPA(); } static bool hasCKGEMM() { return detail::getCUDAHooks().hasCKGEMM(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static bool hasHIP() { return detail::getHIPHooks().hasHIP(); } @@ -199,7 +221,11 @@ class TORCH_API Context { return c10::impl::hasDeviceGuardImpl(c10::DeviceType::IPU); } static bool hasXLA() { +<<<<<<< HEAD return detail::getXLAHooks().hasXLA(); +======= + return c10::impl::hasDeviceGuardImpl(c10::DeviceType::XLA); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } static bool hasXPU() { return detail::getXPUHooks().hasXPU(); @@ -229,6 +255,7 @@ class TORCH_API Context { bool userEnabledMkldnn() const; void setUserEnabledMkldnn(bool e); bool benchmarkCuDNN() const; +<<<<<<< HEAD void setBenchmarkCuDNN(bool /*b*/); int benchmarkLimitCuDNN() const; void setBenchmarkLimitCuDNN(int /*b*/); @@ -238,6 +265,15 @@ class TORCH_API Context { void setDeterministicCuDNN(bool /*b*/); bool deterministicMkldnn() const; void setDeterministicMkldnn(bool /*b*/); +======= + void setBenchmarkCuDNN(bool); + int benchmarkLimitCuDNN() const; + void setBenchmarkLimitCuDNN(int); + bool deterministicCuDNN() const; + void setDeterministicCuDNN(bool); + bool deterministicMkldnn() const; + void setDeterministicMkldnn(bool); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool userEnabledNNPACK() const; void setUserEnabledNNPACK(bool e); @@ -255,6 +291,7 @@ class TORCH_API Context { void setSDPPriorityOrder(const std::vector& order); std::array sDPPriorityOrder(); +<<<<<<< HEAD void setSDPUseFlash(bool /*e*/); bool userEnabledFlashSDP() const; @@ -281,6 +318,34 @@ class TORCH_API Context { at::ROCmFABackend getROCmFAPreferredBackend(); void setROCmFAPreferredBackend(at::ROCmFABackend /*b*/); +======= + void setSDPUseFlash(bool); + bool userEnabledFlashSDP() const; + + void setSDPUseMemEfficient(bool); + bool userEnabledMemEfficientSDP() const; + + void setSDPUseMath(bool); + bool userEnabledMathSDP() const; + + void setSDPUseCuDNN(bool); + bool userEnabledCuDNNSDP() const; + + void setAllowFP16BF16ReductionMathSDP(bool); + bool allowFP16BF16ReductionMathSDP() const; + + void setSDPUseOverrideable(bool); + bool userEnabledOverrideableSDP() const; + + at::LinalgBackend linalgPreferredBackend() const; + void setLinalgPreferredBackend(at::LinalgBackend); + + at::BlasBackend blasPreferredBackend(); + void setBlasPreferredBackend(at::BlasBackend); + + at::ROCmFABackend getROCmFAPreferredBackend() const; + void setROCmFAPreferredBackend(at::ROCmFABackend); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) // Note [Enabling Deterministic Operations] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -313,9 +378,15 @@ class TORCH_API Context { bool deterministicAlgorithms() const; bool deterministicAlgorithmsWarnOnly() const; +<<<<<<< HEAD void setDeterministicAlgorithms(bool /*b*/, bool /*warn_only*/); bool deterministicFillUninitializedMemory() const; void setDeterministicFillUninitializedMemory(bool /*b*/); +======= + void setDeterministicAlgorithms(bool, bool); + bool deterministicFillUninitializedMemory() const; + void setDeterministicFillUninitializedMemory(bool); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Note [Writing Nondeterministic Operations] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -329,7 +400,17 @@ class TORCH_API Context { // // * Throw an error when `Context::deterministicAlgorithms()` is true. Most // of the time, this should be accomplished by calling +<<<<<<< HEAD // `at::globalContext().alertNotDeterminstic(). +======= + // `at::globalContext().alertNotDeterminstic()`. However, if the + // nondeterministic behavior is caused by the CuBLAS workspace + // configuration in CUDA >= 10.2, + // `at::globalContext().alertCuBLASConfigNotDeterministic()` should be + // called instead (in this case, a comment explaining why the operation is + // nondeterministic is not necessary). See below for details on these + // methods. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // * Have an entry in the list of nondeterministic PyTorch operations in the // docstring of `use_deterministic_algorithms()` in torch/__init__.py @@ -353,6 +434,7 @@ class TORCH_API Context { // Throws an error if `Context::deterministicAlgorithms()` is true static void alertNotDeterministic(std::string_view const& caller); +<<<<<<< HEAD void setFloat32MatmulPrecision(const std::string& s); void setFloat32Precision( Float32Backend backend, @@ -376,6 +458,29 @@ class TORCH_API Context { bool allow_splitk = true); bool allowFP16AccumulationCuBLAS() const; void setAllowFP16AccumulationCuBLAS(bool /*b*/); +======= + // Throws an error if `Context::deterministicAlgorithms()` is true, CUDA + // >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or + // ":4096:8". 
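The requirement described in this comment block is user-visible; a minimal sketch of how it is typically satisfied from Python (illustrative, not part of the patch; the environment variable must be set before cuBLAS is initialized):

```python
import os
import torch

# One of the two deterministic cuBLAS workspace configurations named above.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # or ":16:8"
torch.use_deterministic_algorithms(True)
```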
For more details: + // https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility + void alertCuBLASConfigNotDeterministic() const; + + void setFloat32MatmulPrecision(const std::string& s); + bool allowTF32CuDNN() const; + void setAllowTF32CuDNN(bool); + bool allowTF32OneDNN() const; + void setAllowTF32OneDNN(bool); + bool allowTF32CuBLAS() const; + void setAllowTF32CuBLAS(bool); + Float32MatmulPrecision float32MatmulPrecision() const; + void setFloat32MatmulPrecision(Float32MatmulPrecision p); + bool allowFP16ReductionCuBLAS() const; + void setAllowFP16ReductionCuBLAS(bool); + bool allowBF16ReductionCuBLAS() const; + void setAllowBF16ReductionCuBLAS(bool); + bool allowFP16AccumulationCuBLAS() const; + void setAllowFP16AccumulationCuBLAS(bool); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Matmuls can use a so-called "persistent" kernel which launches one CUDA // block for each SM on the GPU, and each block then iterates over multiple @@ -387,7 +492,11 @@ class TORCH_API Context { // to make matmuls target only a subset of the SMs, so they can fully schedule // even next to a comms kernel, and only be a few percent slower. std::optional _SMCarveout_EXPERIMENTAL() const; +<<<<<<< HEAD void _setSMCarveout_EXPERIMENTAL(std::optional /*c*/); +======= + void _setSMCarveout_EXPERIMENTAL(std::optional); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::QEngine qEngine() const; void setQEngine(at::QEngine e); @@ -408,7 +517,11 @@ class TORCH_API Context { void setDefaultMobileCPUAllocator(); void unsetDefaultMobileCPUAllocator(); bool allowFP16ReductionCPU() const; +<<<<<<< HEAD void setAllowFP16ReductionCPU(bool /*b*/); +======= + void setAllowFP16ReductionCPU(bool); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Preserved for BC void lazyInitCUDA() { @@ -438,6 +551,10 @@ class TORCH_API Context { } private: +<<<<<<< HEAD +======= + static bool checkCuBLASConfigDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::array init_; bool enabled_cudnn = true; bool deterministic_cudnn = false; @@ -449,8 +566,12 @@ class TORCH_API Context { at::SDPBackend::flash_attention, at::SDPBackend::efficient_attention, at::SDPBackend::math, +<<<<<<< HEAD at::SDPBackend::cudnn_attention, at::SDPBackend::overrideable}; +======= + at::SDPBackend::cudnn_attention}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool enabled_flashSDP = true; bool enabled_mem_efficientSDP = true; bool enabled_mathSDP = true; @@ -458,17 +579,25 @@ class TORCH_API Context { bool enabled_overrideable = true; bool allow_fp16_bf16_reduction_mathSDP = false; bool benchmark_cudnn = false; +<<<<<<< HEAD bool immediate_miopen = false; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Float32MatmulPrecision float32_matmul_precision = c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true ? 
at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST; int benchmark_limit_cudnn = 10; bool allow_tf32_cudnn = true; +<<<<<<< HEAD CuBLASReductionOption allow_fp16_reduction_cublas = CuBLASReductionOption::AllowReducedPrecisionWithSplitK; CuBLASReductionOption allow_bf16_reduction_cublas = CuBLASReductionOption::AllowReducedPrecisionWithSplitK; +======= + bool allow_fp16_reduction_cublas = true; + bool allow_bf16_reduction_cublas = true; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool allow_fp16_accumulation_cublas = false; std::optional sm_carveout = std::nullopt; bool enabled_mkldnn = true; @@ -494,6 +623,7 @@ class TORCH_API Context { bool release_original_weights = false; #endif bool display_vmap_fallback_warnings_ = false; +<<<<<<< HEAD std::atomic quantized_engine = at::QEngine::NoQEngine; bool enable_sparse_tensor_invariant_checks = false; bool allow_fp16_reduction_cpu = false; @@ -514,6 +644,12 @@ class TORCH_API Context { : Float32Precision::TF32}, }; +======= + std::optional quantized_engine = std::nullopt; + bool enable_sparse_tensor_invariant_checks = false; + bool allow_fp16_reduction_cpu = false; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Allocator* prev_allocator_ptr_{nullptr}; }; @@ -625,10 +761,13 @@ inline bool hasLAPACK() { return globalContext().hasLAPACK(); } +<<<<<<< HEAD inline bool hasEigenSparse() { return globalContext().hasEigenSparse(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline bool hasMAGMA() { return globalContext().hasMAGMA(); } @@ -693,4 +832,8 @@ struct TORCH_API ROCmBackwardPassGuard { ~ROCmBackwardPassGuard(); static bool is_backward_pass(); }; +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp index ccb0ae15a11e6..1bbf2285f39f7 100644 --- a/aten/src/ATen/DLConvertor.cpp +++ b/aten/src/ATen/DLConvertor.cpp @@ -65,6 +65,7 @@ DLDataType getDLDataType(const Tensor& t) { break; // TODO(#146647): use macro here instead of spelling out each shell dtype case ScalarType::Float8_e5m2: +<<<<<<< HEAD dtype.code = DLDataTypeCode::kDLFloat8_e5m2; break; case ScalarType::Float8_e5m2fnuz: @@ -83,29 +84,53 @@ DLDataType getDLDataType(const Tensor& t) { dtype.code = DLDataTypeCode::kDLFloat4_e2m1fn; dtype.lanes = 2; dtype.bits = 4; +======= + case ScalarType::Float8_e5m2fnuz: + case ScalarType::Float8_e4m3fn: + case ScalarType::Float8_e4m3fnuz: + case ScalarType::Float8_e8m0fnu: + TORCH_CHECK(false, "float8 types are not supported by dlpack"); + break; + case ScalarType::Float4_e2m1fn_x2: + TORCH_CHECK(false, "float4 types are not supported by dlpack"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) break; case ScalarType::QInt8: case ScalarType::QUInt8: case ScalarType::QInt32: case ScalarType::QUInt4x2: case ScalarType::QUInt2x4: +<<<<<<< HEAD TORCH_CHECK_BUFFER(false, "QUInt/QInt types are not supported by dlpack"); +======= + TORCH_CHECK(false, "QUInt/QInt types are not supported by dlpack"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) break; case ScalarType::Bits1x8: case ScalarType::Bits2x4: case ScalarType::Bits4x2: case ScalarType::Bits8: case ScalarType::Bits16: +<<<<<<< HEAD TORCH_CHECK_BUFFER(false, "Bit types are not supported by dlpack"); break; case ScalarType::Undefined: TORCH_CHECK_BUFFER(false, "Undefined is not a valid ScalarType"); case ScalarType::NumOptions: TORCH_CHECK_BUFFER(false, "NumOptions is not a valid ScalarType"); +======= + TORCH_CHECK(false, "Bit types are not supported by dlpack"); + break; + case ScalarType::Undefined: + TORCH_CHECK(false, "Undefined is not a valid ScalarType"); + case ScalarType::NumOptions: + TORCH_CHECK(false, "NumOptions is not a valid ScalarType"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return dtype; } +<<<<<<< HEAD DLDevice torchDeviceToDLDevice(at::Device device) { DLDevice ctx; @@ -114,6 +139,12 @@ DLDevice torchDeviceToDLDevice(at::Device device) { : 0; switch (device.type()) { +======= +static DLDevice getDLDevice(const Tensor& tensor, c10::DeviceIndex device_id) { + DLDevice ctx; + ctx.device_id = static_cast(static_cast(device_id)); + switch (tensor.device().type()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case DeviceType::CPU: ctx.device_type = DLDeviceType::kDLCPU; break; @@ -134,7 +165,12 @@ DLDevice torchDeviceToDLDevice(at::Device device) { break; case DeviceType::XPU: ctx.device_type = DLDeviceType::kDLOneAPI; +<<<<<<< HEAD ctx.device_id = at::detail::getXPUHooks().getGlobalIdxFromDevice(device); +======= + ctx.device_id = + at::detail::getXPUHooks().getGlobalIdxFromDevice(tensor.device()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) break; case DeviceType::MAIA: ctx.device_type = DLDeviceType::kDLMAIA; @@ -142,6 +178,7 @@ DLDevice torchDeviceToDLDevice(at::Device device) { case DeviceType::PrivateUse1: ctx.device_type = DLDeviceType::kDLExtDev; break; +<<<<<<< HEAD case DeviceType::MPS: ctx.device_type = DLDeviceType::kDLMetal; break; @@ -154,11 +191,22 @@ DLDevice torchDeviceToDLDevice(at::Device device) { static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* data = nullptr) { switch (type) { +======= + default: + TORCH_CHECK(false, "Cannot pack tensors on " + tensor.device().str()); + } + return ctx; +} + +static Device getATenDevice(const DLDevice& ctx, void* data) { + switch (ctx.device_type) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case DLDeviceType::kDLCPU: return at::Device(DeviceType::CPU); #ifndef USE_ROCM // if we are compiled under HIP, we cannot do cuda case DLDeviceType::kDLCUDA: +<<<<<<< HEAD return at::Device(DeviceType::CUDA, index); #endif case DLDeviceType::kDLOpenCL: @@ -182,16 +230,42 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat default: TORCH_CHECK_BUFFER( false, "Unsupported device_type: ", std::to_string(type)); +======= + return at::Device(DeviceType::CUDA, static_cast(ctx.device_id)); +#endif + case DLDeviceType::kDLOpenCL: + return at::Device(DeviceType::OPENCL, static_cast(ctx.device_id)); + case DLDeviceType::kDLROCM: +#ifdef USE_ROCM + // this looks funny, we need to return CUDA here to masquerade + return at::Device(DeviceType::CUDA, static_cast(ctx.device_id)); +#else + return at::Device(DeviceType::HIP, 
static_cast(ctx.device_id)); +#endif + case DLDeviceType::kDLOneAPI: + return at::detail::getXPUHooks().getDeviceFromPtr(data); + case DLDeviceType::kDLMAIA: + return at::Device(DeviceType::MAIA, static_cast(ctx.device_id)); + case DLDeviceType::kDLExtDev: + return at::Device(DeviceType::PrivateUse1, static_cast(ctx.device_id)); + default: + TORCH_CHECK( + false, "Unsupported device_type: ", std::to_string(ctx.device_type)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } ScalarType toScalarType(const DLDataType& dtype) { ScalarType stype = ScalarType::Undefined; +<<<<<<< HEAD if (dtype.code != DLDataTypeCode::kDLFloat4_e2m1fn) { TORCH_CHECK_BUFFER( dtype.lanes == 1, "ATen does not support lanes != 1 for dtype code", std::to_string(dtype.code)); } +======= + TORCH_CHECK(dtype.lanes == 1, "ATen does not support lanes != 1"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (dtype.code) { case DLDataTypeCode::kDLUInt: switch (dtype.bits) { @@ -208,7 +282,11 @@ ScalarType toScalarType(const DLDataType& dtype) { stype = ScalarType::UInt64; break; default: +<<<<<<< HEAD TORCH_CHECK_BUFFER( +======= + TORCH_CHECK( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) false, "Unsupported kUInt bits ", std::to_string(dtype.bits)); } break; @@ -227,7 +305,11 @@ ScalarType toScalarType(const DLDataType& dtype) { stype = ScalarType::Long; break; default: +<<<<<<< HEAD TORCH_CHECK_BUFFER( +======= + TORCH_CHECK( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) false, "Unsupported kInt bits ", std::to_string(dtype.bits)); } break; @@ -243,7 +325,11 @@ ScalarType toScalarType(const DLDataType& dtype) { stype = ScalarType::Double; break; default: +<<<<<<< HEAD TORCH_CHECK_BUFFER( +======= + TORCH_CHECK( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) false, "Unsupported kFloat bits ", std::to_string(dtype.bits)); } break; @@ -253,7 +339,11 @@ ScalarType toScalarType(const DLDataType& dtype) { stype = ScalarType::BFloat16; break; default: +<<<<<<< HEAD TORCH_CHECK_BUFFER( +======= + TORCH_CHECK( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) false, "Unsupported kFloat bits ", std::to_string(dtype.bits)); } break; @@ -269,7 +359,11 @@ ScalarType toScalarType(const DLDataType& dtype) { stype = ScalarType::ComplexDouble; break; default: +<<<<<<< HEAD TORCH_CHECK_BUFFER( +======= + TORCH_CHECK( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) false, "Unsupported kFloat bits ", std::to_string(dtype.bits)); } break; @@ -279,6 +373,7 @@ ScalarType toScalarType(const DLDataType& dtype) { stype = ScalarType::Bool; break; default: +<<<<<<< HEAD TORCH_CHECK_BUFFER( false, "Unsupported kDLBool bits ", std::to_string(dtype.bits)); } @@ -352,11 +447,20 @@ ScalarType toScalarType(const DLDataType& dtype) { break; default: TORCH_CHECK_BUFFER(false, "Unsupported code ", std::to_string(dtype.code)); +======= + TORCH_CHECK( + false, "Unsupported kDLBool bits ", std::to_string(dtype.bits)); + } + break; + default: + TORCH_CHECK(false, "Unsupported code ", 
std::to_string(dtype.code)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return stype; } namespace { +<<<<<<< HEAD // The templated classes below are needed for supporting both: // - DLManagedTensor @@ -383,10 +487,21 @@ void fillVersion( tensor->flags = 0; tensor->version.major = DLPACK_MAJOR_VERSION; tensor->version.minor = DLPACK_MINOR_VERSION; +======= +struct ATenDLMTensor { + Tensor handle; + DLManagedTensor tensor{}; +}; +} // namespace + +static void deleter(DLManagedTensor* arg) { + delete static_cast(arg->manager_ctx); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // This function returns a shared_ptr to memory managed DLpack tensor // constructed out of ATen tensor +<<<<<<< HEAD template T* toDLPackImpl(const Tensor& src) { ATenDLMTensor* atDLMTensor(new ATenDLMTensor); @@ -429,18 +544,73 @@ at::Tensor fromDLPackImpl(T* src, std::function deleter) { return at::from_blob( dl_tensor.data, IntArrayRef(dl_tensor.shape, dl_tensor.ndim), +======= +DLManagedTensor* toDLPack(const Tensor& src) { + // create a new tensor with possibly normalized strides + // gh-83069 + auto shape = src.sizes(); + auto strides = src.strides().vec(); + for (int i = 0; i < src.dim(); i++) { + if (shape[i] < 2) { + strides[i] = 1; + } + } + + auto view = src.as_strided(shape, strides, src.storage_offset()); + ATenDLMTensor* atDLMTensor(new ATenDLMTensor); + atDLMTensor->handle = view; + atDLMTensor->tensor.manager_ctx = atDLMTensor; + atDLMTensor->tensor.deleter = &deleter; + atDLMTensor->tensor.dl_tensor.data = view.data_ptr(); + c10::DeviceIndex device_id = 0; + if (src.is_cuda() || src.is_privateuseone()) { + device_id = src.get_device(); + } + atDLMTensor->tensor.dl_tensor.device = getDLDevice(src, device_id); + atDLMTensor->tensor.dl_tensor.ndim = static_cast(src.dim()); + atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src); + atDLMTensor->tensor.dl_tensor.shape = view.sizes().data(); + atDLMTensor->tensor.dl_tensor.strides = view.strides().data(); + atDLMTensor->tensor.dl_tensor.byte_offset = 0; + return &(atDLMTensor->tensor); +} + +Tensor fromDLPack(DLManagedTensor* src) { + auto deleter = [src](void* self [[maybe_unused]]) { + if (src->deleter) { + src->deleter(src); + } + }; + return fromDLPack(src, std::move(deleter)); +} + +Tensor fromDLPack(DLManagedTensor* src, std::function deleter) { + Device device = getATenDevice(src->dl_tensor.device, src->dl_tensor.data); + ScalarType stype = toScalarType(src->dl_tensor.dtype); + if (!src->dl_tensor.strides) { + return at::from_blob( + src->dl_tensor.data, + IntArrayRef(src->dl_tensor.shape, src->dl_tensor.ndim), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::move(deleter), at::device(device).dtype(stype), {device}); } return at::from_blob( +<<<<<<< HEAD dl_tensor.data, IntArrayRef(dl_tensor.shape, dl_tensor.ndim), IntArrayRef(dl_tensor.strides, dl_tensor.ndim), +======= + src->dl_tensor.data, + IntArrayRef(src->dl_tensor.shape, src->dl_tensor.ndim), + IntArrayRef(src->dl_tensor.strides, src->dl_tensor.ndim), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) deleter, at::device(device).dtype(stype), {device}); } +<<<<<<< HEAD // Explicitly instantiate the template above for both classes. 
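The toScalarType switch above maps DLPack's (code, bits, lanes) triple onto ATen scalar types, and getDLDataType produces the inverse mapping. A minimal usage sketch, assumed rather than taken from the patch, that exercises the two public helpers declared in DLConvertor.h:

```cpp
// Minimal sketch (assumed usage, not part of the patch): round-trip a dtype
// through the DLPack descriptors handled by the switch above.
#include <ATen/ATen.h>
#include <ATen/DLConvertor.h>

int main() {
  at::Tensor t = at::zeros({2, 2}, at::kBFloat16);
  // getDLDataType fills code/bits/lanes, e.g. kDLBfloat / 16 / 1 here.
  DLDataType dt = at::getDLDataType(t);
  // toScalarType inverts that mapping; unsupported combinations raise.
  TORCH_CHECK(at::toScalarType(dt) == at::ScalarType::BFloat16);
  return 0;
}
```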
template at::Tensor fromDLPackImpl(DLManagedTensor* src, std::function deleter); @@ -495,4 +665,6 @@ Tensor maybeCopyTensor( return data; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at diff --git a/aten/src/ATen/DLConvertor.h b/aten/src/ATen/DLConvertor.h index 928731fafb2f6..f28623e744728 100644 --- a/aten/src/ATen/DLConvertor.h +++ b/aten/src/ATen/DLConvertor.h @@ -4,7 +4,11 @@ #include #include +<<<<<<< HEAD // this converter will: +======= +// this convertor will: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // 1) take a Tensor object and wrap it in the DLPack tensor // 2) take a dlpack tensor and convert it to the ATen Tensor @@ -12,6 +16,7 @@ namespace at { TORCH_API ScalarType toScalarType(const DLDataType& dtype); TORCH_API DLManagedTensor* toDLPack(const Tensor& src); +<<<<<<< HEAD TORCH_API struct DLManagedTensorVersioned* toDLPackVersioned(const Tensor& src); TORCH_API Tensor fromDLPack(DLManagedTensor* src, std::function deleter = {}); @@ -66,4 +71,12 @@ struct DLPackTraits { inline static auto fromDLPack = at::fromDLPackVersioned; }; +======= +TORCH_API Tensor fromDLPack(DLManagedTensor* src); +TORCH_API Tensor +fromDLPack(DLManagedTensor* src, std::function deleter); +TORCH_API DLDataType getDLDataType(const Tensor& t); +TORCH_API DLDevice getDLContext(const Tensor& tensor, const int64_t& device_id); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at diff --git a/aten/src/ATen/DeviceAccelerator.h b/aten/src/ATen/DeviceAccelerator.h index f23b35047fcc8..0f01d3278929a 100644 --- a/aten/src/ATen/DeviceAccelerator.h +++ b/aten/src/ATen/DeviceAccelerator.h @@ -1,6 +1,9 @@ #pragma once +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -31,7 +34,11 @@ TORCH_API bool isAccelerator(c10::DeviceType device_type); template < typename... T, typename = std::enable_if_t<(std::is_same_v && ...)>> +<<<<<<< HEAD inline bool isAcceleratorExcluded( +======= +TORCH_API inline bool isAcceleratorExcluded( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::DeviceType device_type, c10::DeviceType first_excluded, T... rest_excluded) { @@ -73,6 +80,7 @@ TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index); // original device index that was active before the change. 
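Both sides of the DLConvertor conflict keep the same external contract: toDLPack wraps a tensor in a DLManagedTensor whose deleter owns a reference to the source storage, and fromDLPack re-imports it as a tensor aliasing the same memory. A hedged sketch of that exporter/importer pair, using only the pre-versioned DLManagedTensor entry points declared in the header above:

```cpp
// Sketch of the toDLPack/fromDLPack pair declared in DLConvertor.h above
// (pre-versioned DLManagedTensor API); illustrative, not part of the patch.
#include <ATen/ATen.h>
#include <ATen/DLConvertor.h>

int main() {
  at::Tensor src = at::arange(6, at::kFloat).reshape({2, 3});
  // Export: the returned DLManagedTensor keeps src's storage alive via its
  // manager_ctx until the consumer invokes its deleter.
  DLManagedTensor* managed = at::toDLPack(src);
  // Import: fromDLPack builds a tensor that aliases the exported memory and
  // calls managed->deleter once the new tensor is destroyed.
  at::Tensor dst = at::fromDLPack(managed);
  TORCH_CHECK(dst.data_ptr() == src.data_ptr());
  TORCH_CHECK(dst.sizes() == src.sizes());
  return 0;
}
```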
TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index); +<<<<<<< HEAD TORCH_API inline void emptyCache() { const auto device_type = getAccelerator(true).value(); at::getDeviceAllocator(device_type)->emptyCache(); @@ -94,6 +102,8 @@ TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) { at::getDeviceAllocator(device_type)->resetPeakStats(device_index); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::accelerator namespace at { diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp index 0e535ab20cd21..e6faaaf1e7f12 100644 --- a/aten/src/ATen/EmptyTensor.cpp +++ b/aten/src/ATen/EmptyTensor.cpp @@ -31,9 +31,13 @@ c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { return at::globalContext().getPinnedMemoryAllocator(opt_device_type); } else { TORCH_CHECK( +<<<<<<< HEAD false, "pin_memory=True requires a CUDA or other accelerator backend; " "no pinned memory allocator is available on this system.") +======= + false, "Need to provide pin_memory allocator to use pin memory.") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } diff --git a/aten/src/ATen/EmptyTensor.h b/aten/src/ATen/EmptyTensor.h index ac76d09537fa2..952a85bced0d6 100644 --- a/aten/src/ATen/EmptyTensor.h +++ b/aten/src/ATen/EmptyTensor.h @@ -16,8 +16,13 @@ inline void check_size_nonnegative(ArrayRef size) { inline void check_size_nonnegative(ArrayRef size) { for (const auto& x : size) { +<<<<<<< HEAD TORCH_SYM_CHECK( x.sym_ge(0), +======= + TORCH_CHECK( + x.expect_size(__FILE__, __LINE__), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "Trying to create tensor with negative dimension ", x, ": ", diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 1bf46ebe61b61..2ab18ed01ab07 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -468,7 +468,11 @@ inline Tensor _sum_to( // if we assume no reduction due to unbacked we ensure that at runtime. TORCH_MAYBE_SYM_CHECK( sym_eq(shape[i - leading_dims], sizes[i]), +<<<<<<< HEAD "non-reduction path was assumed due to unbacked symbols expected those two sizes to be the same:", +======= + "non-reduction path was assumed due to unabcked symbols expected those two sizes to be the same:", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shape[i - leading_dims], ", ", sizes[i]) diff --git a/aten/src/ATen/FunctionalInverses.cpp b/aten/src/ATen/FunctionalInverses.cpp index 123d87b304148..1af022b0411ae 100644 --- a/aten/src/ATen/FunctionalInverses.cpp +++ b/aten/src/ATen/FunctionalInverses.cpp @@ -233,8 +233,13 @@ Tensor FunctionalInverses::slice_Tensor_inverse(const Tensor& base, const Tensor // NOLINTNEXTLINE(performance-unnecessary-value-param) Tensor FunctionalInverses::split_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, int64_t mutated_view_idx, c10::SymInt split_size, int64_t dim) { +<<<<<<< HEAD // It would be nice if this logic could be reused from autograd's split_backward(), but I don't think it can. 
// For functionalization, we have only have one of the tensors from the TensorList outputted by split(), and we want to layer i +======= + // It would be nice if this logic could be re-used from autograd's split_backward(), but I don't think it can. + // For functionalization, we have only have one of the tensors from the TensorList outputed by split(), and we want to layer i +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // on top of the base tensor. // For autograd, we have all of the tensors outputted by split() and we just want to stack them. dim = at::maybe_wrap_dim(dim, base.dim()); diff --git a/aten/src/ATen/FunctionalStorageImpl.cpp b/aten/src/ATen/FunctionalStorageImpl.cpp index 9631872875c69..c4ffe05866473 100644 --- a/aten/src/ATen/FunctionalStorageImpl.cpp +++ b/aten/src/ATen/FunctionalStorageImpl.cpp @@ -9,6 +9,14 @@ namespace at::functionalization { +<<<<<<< HEAD +======= +ViewMeta ViewMeta::to_out_idx(int64_t out_idx) { + if (out_idx == this->out_index) return *this; + return ViewMeta(forward_fn, reverse_fn, has_symbolic_inputs, is_multi_output, is_as_strided, out_idx); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Note [Functionalization: Alias Removal Part 2] // See Note [Functionalization: Alias Removal] for more details. // This function applies a single update from one of the views to the StorageImpl. @@ -37,12 +45,20 @@ namespace at::functionalization { static const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) { at::Tensor t = update.new_val; TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); +<<<<<<< HEAD if (update.view_metas.empty()) { return t; } +======= + if (update.view_metas.empty()) return t; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::vector tmp_values({base}); tmp_values.reserve(update.view_metas.size()); for (size_t i = 0; i < update.view_metas.size() - 1; ++i) { +<<<<<<< HEAD at::Tensor next_view = update.view_metas[i]->forward(tmp_values.back()); +======= + at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided // All of these ops require additional information to recover the sizes of the original tensor. // If need to, we could probably apply this optimization and only bother computing tmp_values @@ -50,8 +66,14 @@ static const Tensor apply_update(const FunctionalStorageImpl::Update& update, co tmp_values.push_back(std::move(next_view)); } for(int64_t i = static_cast(update.view_metas.size()) - 1; i >= 0; --i) { +<<<<<<< HEAD // Each view inverse is implemented in ViewInverses.cpp. t = update.view_metas[i]->reverse(tmp_values[i], t); +======= + int64_t out_idx = update.view_metas[i].out_index; + // Each view inverse is implemented in ViewInverses.cpp. 
+ t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); return t; @@ -96,7 +118,11 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base) // SparseTensorImpl has no storage, so we cannot query its nbytes. // (original_storage_size is only used for storage resizing in fsdp anyway, which does not apply to sparse) // Same for XLA +<<<<<<< HEAD if (base.unsafeGetTensorImpl()->has_storage() && data_ptr().device().type() != c10::DeviceType::XLA) { +======= + if (base.unsafeGetTensorImpl()->has_storage() && base.device().type() != c10::DeviceType::XLA) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) original_storage_size_ = base.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl()->sym_nbytes(); } else { original_storage_size_ = -1; @@ -105,13 +131,21 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base) TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_)); } +<<<<<<< HEAD void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector>& metas) { +======= +void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector& metas) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage"); if (metas.size() > 1) { for (size_t i = 1; i < metas.size(); ++i) { // Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI +<<<<<<< HEAD TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i]->is_as_strided, +======= + TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i].is_as_strided, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i, " was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today," "so this behavior is banned in compile. As a workaround, you can either remove the mutation from the model code, or you " diff --git a/aten/src/ATen/FunctionalStorageImpl.h b/aten/src/ATen/FunctionalStorageImpl.h index 0c9c1fd775f32..9fff2c6e0b677 100644 --- a/aten/src/ATen/FunctionalStorageImpl.h +++ b/aten/src/ATen/FunctionalStorageImpl.h @@ -8,6 +8,7 @@ namespace at::functionalization { // See Note [Functionalization Pass In Core] +<<<<<<< HEAD enum class InverseReturnMode { /// Specifies that functional inverses should always return a view. AlwaysView, @@ -77,20 +78,58 @@ enum class InverseReturnMode { // a type are used for supporting pickle serialization. struct ViewMeta { ViewMeta( +======= +// ViewMeta is a class used by the functionalization pass to navigate between +// a base tensor and a view tensor. 
+// For example, if I call `b = a.view1(...)` +// the functionalization pass will generate and store a ViewMeta on b that looks +// like: +// +// ViewMeta( +// [](const Tensor& base, int64_t mutated_view_idx) { +// return base.view1(...); +// }, +// [](const at::Tensor& base, const at::Tensor& mutated_view, +// int64_t mutated_view_idx) -> at::Tensor { +// return at::functionalization::impl::view1_inverse(base, mutated_view, +// ...); +// } +// +// The forward_fn lambda describes how to replay view1 on a tensor. +// +// The reverse_fn lambda describes how, given a tensor that is already a view, +// how to get the corresponding base tensor. See Note [Functionalization Pass: +// View Inverses] for details. +struct ViewMeta { + ViewMeta( + std::function forward, + std::function reverse, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool has_symbolic_inputs, bool is_multi_output = false, bool is_as_strided = false, int64_t out_idx = 0) +<<<<<<< HEAD : out_index(out_idx), +======= + : forward_fn(std::move(forward)), + reverse_fn(std::move(reverse)), + out_index(out_idx), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) is_multi_output(is_multi_output), is_as_strided(is_as_strided), has_symbolic_inputs(has_symbolic_inputs) {} +<<<<<<< HEAD virtual ~ViewMeta() = default; virtual Tensor forward(const Tensor& base) = 0; virtual Tensor reverse(const Tensor& base, const Tensor& mutated_view) = 0; +======= + std::function forward_fn; + std::function reverse_fn; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // See Note [out_idx in ViewMeta] int64_t out_index; @@ -102,6 +141,7 @@ struct ViewMeta { // Tells us if this view operation has any symbolic inputs bool has_symbolic_inputs; +<<<<<<< HEAD // Returns a new ViewMeta with the same forward/reverse // functions, but a new out index. // @@ -113,6 +153,12 @@ struct ViewMeta { "ViewMeta::to_out_index not implemented. ", "Likely because there's only one output."); } +======= + // Returns a copy of the current ViewMeta, if out_idx matches the current + // out_index. Otherwise, returns a new ViewMeta with the same forward/reverse + // functions, but a new out index. 
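The comment above documents the lambda-based ViewMeta that the right-hand side of this conflict still uses (the left-hand side replaces it with virtual forward/reverse methods). A hypothetical hand-written instance for a `select(0, idx)` view, shown only to make the forward_fn/reverse_fn contract concrete; real instances are produced by codegen:

```cpp
// Hypothetical ViewMeta for `b = a.select(0, idx)` under the lambda-based API
// described above; only compiles against that variant of the struct.
#include <ATen/ATen.h>
#include <ATen/FunctionalStorageImpl.h>

at::functionalization::ViewMeta make_select_view_meta(int64_t idx) {
  return at::functionalization::ViewMeta(
      // forward_fn: how to replay the view on a (possibly updated) base.
      [idx](const at::Tensor& base, int64_t /*mutated_view_idx*/) -> at::Tensor {
        return base.select(0, idx);
      },
      // reverse_fn: how to write a mutated view back into the base
      // (see Note [Functionalization Pass: View Inverses]).
      [idx](const at::Tensor& base, const at::Tensor& mutated_view,
            int64_t /*mutated_view_idx*/) -> at::Tensor {
        return base.select_scatter(mutated_view, 0, idx);
      },
      /*has_symbolic_inputs=*/false);
}
```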
+ ViewMeta to_out_idx(int64_t out_idx); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; // FunctionalStorageImpl is a subclass of StorageImpl used by the @@ -145,14 +191,22 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) const at::Tensor new_val; // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) +<<<<<<< HEAD const std::vector> view_metas; +======= + const std::vector view_metas; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; explicit FunctionalStorageImpl(const Tensor& value); void add_update( const Tensor& updated_val, +<<<<<<< HEAD const std::vector>& view_metas); +======= + const std::vector& view_metas); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool apply_updates(); const Tensor& base() { return base_; @@ -174,9 +228,12 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { ~FunctionalStorageImpl() override = default; +<<<<<<< HEAD uint64_t mutation_counter() { return mutation_counter_; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void mark_mutation() { mutation_counter_++; } @@ -205,17 +262,23 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { void mark_inductor_storage_resize(c10::SymInt new_size) { inductor_storage_resized_ = true; curr_storage_size_ = std::move(new_size); +<<<<<<< HEAD inductor_storage_resized_counter_++; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } bool was_inductor_storage_resized() { return inductor_storage_resized_; } +<<<<<<< HEAD uint64_t inductor_storage_resized_counter() { return inductor_storage_resized_counter_; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) private: // NB: base_ should always point to a tensor BELOW the current // functionalization layer. This is mainly to avoid reference cycles. e.g. @@ -261,7 +324,10 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { // (1) There were any storage resizes on a graph input // (2) The original/curr storage size tell us if these resizes result in a nop bool inductor_storage_resized_ = false; +<<<<<<< HEAD uint64_t inductor_storage_resized_counter_ = 0; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::SymInt original_storage_size_; c10::SymInt curr_storage_size_; }; diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp index 8b7b3bc42a9cb..cfe3e5867b10d 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.cpp +++ b/aten/src/ATen/FunctionalTensorWrapper.cpp @@ -122,13 +122,18 @@ void FunctionalTensorWrapper::freeze_storage() const { // | have their own storages, but backends like functorch | // \/ are allowed to re-alias underneath the pass \/ // . - - - - - - - - - - - - - . . - - - - - - - - - - - - - - - . 
+<<<<<<< HEAD // | underlying_storage | | underlying_storage | +======= +// | underyling_storage | | underyling_storage | +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // . - - - - - - - - - - - - - . . - - - - - - - - - - - - - - - . // // This constructor is only used by view ops. // - view_value: The output tensor that we need to wrap. // - base: The "base" of the view that `view_value` was generated from. // See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic. +<<<<<<< HEAD FunctionalTensorWrapper::FunctionalTensorWrapper( const Tensor& view_value, const FunctionalTensorWrapper* base, @@ -142,6 +147,19 @@ FunctionalTensorWrapper::FunctionalTensorWrapper( base->is_multi_output_view_ || meta->is_multi_output), was_storage_changed_(base->was_storage_changed_), is_symbolic_(base->is_symbolic_) { +======= +FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, const functionalization::ViewMeta& meta) + : c10::TensorImpl( + c10::DispatchKeySet(DispatchKey::Functionalize), + view_value.dtype(), + view_value.device() + ), + value_(view_value), + is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output), + was_storage_changed_(base->was_storage_changed_), + is_symbolic_(base->is_symbolic_) +{ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_)); TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); set_constructor_metadata(); @@ -150,10 +168,18 @@ FunctionalTensorWrapper::FunctionalTensorWrapper( view_metas_ = base->view_metas_; // copy } view_metas_.push_back(meta); +<<<<<<< HEAD maybe_mark_symbolic(meta.get()); storage_ = base->storage_; // alias this tensor's storage with the base tensor's } +======= + maybe_mark_symbolic(meta); + storage_ = base->storage_; // alias this tensor's storage with the base tensor's +} + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const { return static_cast(storage_.unsafeGetStorageImpl()); } @@ -177,18 +203,31 @@ bool FunctionalTensorWrapper::is_up_to_date() const { } // See Note [Functionalization Pass - Inplace View Ops] +<<<<<<< HEAD void FunctionalTensorWrapper::mutate_view_meta(const std::shared_ptr& meta) { view_metas_.push_back(meta); // Manually track the fact that this tensor received a metadata mutation! has_metadata_mutation_ = true; // Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation. maybe_mark_symbolic(meta.get()); +======= +void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) { + view_metas_.push_back(meta); + // Manually track the fact that this tensor recieved a metadata mutation! + has_metadata_mutation_ = true; + // Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation. + maybe_mark_symbolic(meta); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Note [Functionalization Pass - Inplace View Ops] // So, these ops are special - they're mutation AND view ops. They get special codegen. 
// An example is transpose_, e.g. `a.transpose_()` // Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas. at::AutoDispatchSkipFunctionalize guard; +<<<<<<< HEAD value_ = meta->forward(value_); +======= + value_ = meta.forward_fn(value_, meta.out_index); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); } @@ -274,7 +313,11 @@ void FunctionalTensorWrapper::set__impl(const FunctionalTensorWrapper* other) { // (We could check if the updated value has a new storage than the original value, // but this won't also let us uniquely determine if the tensor **also** // experienced a data mutation). +<<<<<<< HEAD mark_storage_changed(); +======= + was_storage_changed_ = true; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto sizes_ = value_.sym_sizes(); auto strides_ = value_.sym_strides(); @@ -287,11 +330,19 @@ void FunctionalTensorWrapper::storage_resize_(const c10::SymInt& new_size) { // storage resizing is severely limited: we only support resizing either to zero, or from zero bytes. TORCH_CHECK(new_size == 0 || curr_storage_size == 0, "new_size: ", new_size, ". curr_storage_size: ", curr_storage_size); // The "functionalization rule" for storage resizing is a giant no-op, mainly because we don't want +<<<<<<< HEAD // resize_() calls to actually emit any ops in the functional graph. // How does it work? // Resizing up (old size == 0): // We do nothing in this case. // The expectation is that for the user code to be valid, the next op that should run against the current tensor "x" +======= + // resize_() calls to actualy emit any ops in the functional graph. + // How does it work? + // Resizing up (old size == 0): + // We do nothing in this case. + // The expection is that for the user code to be valid, the next op that should run against the current tensor "x" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // will be a x.copy_(y) (or similar), that will fully overwrite the data of x. // If there are any outstanding aliases of x, we expect them not to be used until after the copy_() call // (otherwise the eager code would be invalid), @@ -328,7 +379,11 @@ void FunctionalTensorWrapper::maybe_replace_storage(const Tensor& other) { // We're also no longer re-generate "b" fully from "a" anymore, since "a" refers to a slice of "b"'s data. // // This is probably fixable in theory, but: +<<<<<<< HEAD // - the fix would likely complicated the functionalization logic quite a bit. +======= + // - the fix would likey complicated the functionalization logic quite a bit. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // - the primary use case for resize_() today is resizing zero-sized tensors in out= variants of operators // - resize_() also can give you weird results today if you try to resize_() a weirdly strided tensor. // @@ -345,7 +400,11 @@ void FunctionalTensorWrapper::maybe_replace_storage(const Tensor& other) { set_sizes_and_strides(value_.sizes(), value_.strides()); refresh_numel(); // (Technically we should be guaranteed that the tensor was already contiguous, +<<<<<<< HEAD // since it's guaranteed not to have been a view. 
Doesn't hurt to run though) +======= + // since it's guaranteed not to have been a view. Doesnt hurt to run though) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) refresh_contiguous(); // Swapping out the storage of a tensor (aka from a resize_() call) will update the sizes and strides of the tensor, // so we need to record the fact that metadata was mutated. @@ -369,8 +428,20 @@ void FunctionalTensorWrapper::sync_() { regenerate_from_base(); } +<<<<<<< HEAD const std::vector>& FunctionalTensorWrapper::view_metas() const { return view_metas_; +======= +Tensor FunctionalTensorWrapper::apply_view_metas(const Tensor& base) { + auto t = base; + + // Reapply views to get the viewed tensor from the base in alias_ + for (auto& view_meta: view_metas_) { + t = view_meta.forward_fn(t, view_meta.out_index); + } + + return t; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void FunctionalTensorWrapper::regenerate_from_base() { @@ -379,7 +450,11 @@ void FunctionalTensorWrapper::regenerate_from_base() { auto t = storage_impl->base(); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); +<<<<<<< HEAD t = at::functionalization::impl::apply_view_meta_sequence(t, view_metas_); +======= + t = apply_view_metas(t); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); replace_(t, /*from_lazy_regenerate=*/true); @@ -479,10 +554,14 @@ void FunctionalTensorWrapper::shallow_copy_from(const c10::intrusive_ptrdevice(). return storage().data_ptr().device(); +======= + return value_.unsafeGetTensorImpl()->device(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } at::IntArrayRef FunctionalTensorWrapper::sizes_custom() const { return value_.unsafeGetTensorImpl()->sizes(); @@ -496,8 +575,13 @@ int64_t FunctionalTensorWrapper::dim_custom() const { int64_t FunctionalTensorWrapper::numel_custom() const { return value_.unsafeGetTensorImpl()->numel(); } +<<<<<<< HEAD c10::SymBool FunctionalTensorWrapper::sym_is_contiguous_custom(at::MemoryFormat memory_format) const { return value_.unsafeGetTensorImpl()->sym_is_contiguous(memory_format); +======= +bool FunctionalTensorWrapper::is_contiguous_custom(at::MemoryFormat memory_format) const { + return value_.unsafeGetTensorImpl()->is_contiguous(memory_format); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } c10::SymIntArrayRef FunctionalTensorWrapper::sym_sizes_custom() const { return value_.unsafeGetTensorImpl()->sym_sizes(); @@ -576,7 +660,11 @@ std::vector from_functional_tensor(ITensorListRef t_list) { for (const auto& tensor : t_list) { // from_functional_tensor(Tensor) has asserts to make sure you don't accidentally call // it on a non-functional input, +<<<<<<< HEAD // but from_functional_tensor(TensorList) can receive a list containing both +======= + // but from_functional_tensor(TensorList) can recieve a list containing both +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // functional and non-functional tensors. // Example of when that can happen: torch.cat(function_input_tensor, global_state_tensor). 
// When that happens, we're okay with only unwrapping the functional tensors. @@ -721,11 +809,19 @@ bool isFunctionalTensor(const std::optional& t) { } bool isFunctionalTensor(const c10::List<::std::optional>& t_list) { +<<<<<<< HEAD if (t_list.empty()) { return false; } auto functional_count = 0; for (const auto i : c10::irange(t_list.size())) { auto const & e= t_list[i]; if (!e.has_value() || !e->defined()) { continue; } +======= + if (t_list.empty()) return false; + auto functional_count = 0; + for (const auto i : c10::irange(t_list.size())) { + auto const & e= t_list[i]; + if (!e.has_value() || !e->defined()) continue; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (isFunctionalTensor(e)) { ++functional_count; } @@ -735,10 +831,17 @@ bool isFunctionalTensor(const c10::List<::std::optional>& t_list) { template static bool isFunctionalTensorIListRef(c10::IListRef list) { +<<<<<<< HEAD if (list.size() == 0) { return false; } auto functional_count = 0; for (const auto& tensor : list) { if (!tensor.defined()) { continue; } +======= + if (list.size() == 0) return false; + auto functional_count = 0; + for (const auto& tensor : list) { + if (!tensor.defined()) continue; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (isFunctionalTensor(tensor)) { ++functional_count; } @@ -756,6 +859,7 @@ void freeze_functional_tensor(const Tensor& tensor) { functional_base_impl->freeze_storage(); } +<<<<<<< HEAD Tensor create_functional_tensor_with_view_meta( const at::Tensor& view_to_wrap, const at::Tensor& base, @@ -765,10 +869,17 @@ Tensor create_functional_tensor_with_view_meta( TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base)); auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base); auto meta_ = meta; +======= +Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) { + TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap)); + TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base)); + auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (out_idx != 0) { // Note [out_idx in ViewMeta] // When a view op outputs multiple tensors, each output needs its own separate ViewMeta. // Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function. 
+<<<<<<< HEAD meta_ = meta->to_out_index(out_idx); } return at::detail::make_tensor(view_to_wrap, functional_base_impl, meta_); @@ -778,6 +889,14 @@ std::vector create_functional_tensor_with_view_meta( ITensorListRef view_to_wrap, const at::Tensor& base, const std::shared_ptr& meta) { +======= + meta = meta.to_out_idx(out_idx); + } + return at::detail::make_tensor(view_to_wrap, functional_base_impl, meta); +} + +std::vector create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, const functionalization::ViewMeta& meta) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::vector outputs(view_to_wrap.size()); int64_t i = 0; for (const auto& tensor : view_to_wrap) { @@ -787,12 +906,17 @@ std::vector create_functional_tensor_with_view_meta( return outputs; } +<<<<<<< HEAD void mutate_view_meta(const at::Tensor& self, const std::shared_ptr& meta) { +======= +void mutate_view_meta(const at::Tensor& self, const functionalization::ViewMeta& meta) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self)); auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self); self_impl->mutate_view_meta(meta); } +<<<<<<< HEAD Tensor apply_view_meta_sequence( const Tensor& base, const std::vector>& sequence) { @@ -803,6 +927,8 @@ Tensor apply_view_meta_sequence( return r; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Note [Propagating strides in the functionalization pass] // In order to properly compute stride information, the functionalization pass // calls each {view} reference implementations with meta tensors. @@ -834,7 +960,11 @@ void setFunctionalizationReapplyViewsTLS(bool reapply_views) { // This function will "functionalize" it. // That is, it will call the operator, but removing any intermediate views/mutations // that are performed inside of it. +<<<<<<< HEAD // This is useful for LTC/XLA, which would like to reuse some of our composite kernels +======= +// This is useful for LTC/XLA, which would like to re-use some of our composite kernels +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // from pytorch core but not have to worry about the view ops that they might call. // e.g. 
at::block_diag void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* stack) { @@ -896,7 +1026,11 @@ void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* s const auto& ivalue = returns[idx]; if (ivalue.isTensor()) { const auto& t = ivalue.toTensor(); +<<<<<<< HEAD if (!t.defined()) { continue; } +======= + if (!t.defined()) continue; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::functionalization::impl::sync(t); auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t)); (*stack)[returns_begin + idx] = t_new; diff --git a/aten/src/ATen/FunctionalTensorWrapper.h b/aten/src/ATen/FunctionalTensorWrapper.h index 6d9050728da70..3beade67d507a 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.h +++ b/aten/src/ATen/FunctionalTensorWrapper.h @@ -56,7 +56,11 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { explicit FunctionalTensorWrapper( const Tensor& view_value, const FunctionalTensorWrapper* base, +<<<<<<< HEAD const std::shared_ptr& meta); +======= + const functionalization::ViewMeta& meta); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Get the underlying, actual tensor, that doesn't know anything about // functionalization. @@ -74,9 +78,13 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { bool has_metadata_mutation() const { return has_metadata_mutation_; } +<<<<<<< HEAD uint64_t mutation_counter() const { return functional_storage_impl()->mutation_counter(); } +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void mark_mutation() { functional_storage_impl()->mark_mutation(); } @@ -99,17 +107,28 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { ->are_all_mutations_under_no_grad_or_inference_mode(); } +<<<<<<< HEAD void maybe_mark_symbolic(functionalization::ViewMeta* meta) { is_symbolic_ = is_symbolic_ | meta->has_symbolic_inputs; +======= + void maybe_mark_symbolic(const functionalization::ViewMeta& meta) { + is_symbolic_ = is_symbolic_ | meta.has_symbolic_inputs; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } bool is_symbolic() const { return is_symbolic_; } +<<<<<<< HEAD // Retrieves the ViewMeta sequence of this tensor. const std::vector>& view_metas() const; +======= + // Runs the forward_fn of every ViewMeta collected in the current instance + // to some other base. + Tensor apply_view_metas(const Tensor& base); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Sync's the underlying tensor with its alias, if it's out of date. This // involves two steps: 1) Apply any pending updates/mutations to the alias 2) @@ -146,8 +165,12 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { // from the base tensor. This method is used by inplace-view ops like // transpose_. It appends a ViewMeta to the existing stack, and refreshes the // tensor by replaying the views off of the alias. 
+<<<<<<< HEAD void mutate_view_meta( const std::shared_ptr& meta); +======= + void mutate_view_meta(const at::functionalization::ViewMeta& meta); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Custom implementation of self.set_(src) void set__impl(const FunctionalTensorWrapper* other); @@ -164,6 +187,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { return was_storage_changed_; } +<<<<<<< HEAD void mark_storage_changed() { was_storage_changed_ = true; storage_changed_counter_++; @@ -171,6 +195,10 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { uint64_t storage_changed_counter() { return storage_changed_counter_; +======= + void set_storage_changed() { + was_storage_changed_ = true; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // A FunctionalTensor is considered a base if its not a view of another @@ -189,9 +217,12 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { return functional_storage_impl()->was_inductor_storage_resized(); } +<<<<<<< HEAD bool inductor_storage_resized_counter() { return functional_storage_impl()->inductor_storage_resized_counter(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // The functionalization pass can be used to remove mutations. // It does so by replacing any mutation op with it's corresponding // out-of-place op, followed by a call to replace_(). e.g: @@ -237,8 +268,12 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { at::IntArrayRef strides_custom() const override; int64_t dim_custom() const override; int64_t numel_custom() const override; +<<<<<<< HEAD c10::SymBool sym_is_contiguous_custom( at::MemoryFormat memory_format) const override; +======= + bool is_contiguous_custom(at::MemoryFormat memory_format) const override; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::SymIntArrayRef sym_sizes_custom() const override; c10::SymInt sym_size_custom(int64_t d) const override; c10::SymIntArrayRef sym_strides_custom() const override; @@ -281,12 +316,19 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { bool is_multi_output_view_ = false; // Did the tensor experience a set_() call. bool was_storage_changed_ = false; +<<<<<<< HEAD uint64_t storage_changed_counter_ = 0; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Did the tensor experience any view operation with symbolic int. 
bool is_symbolic_ = false; size_t generation_ = 0; +<<<<<<< HEAD std::vector> view_metas_; +======= + std::vector view_metas_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) protected: static void copy_tensor_metadata( @@ -301,7 +343,11 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { namespace functionalization { namespace impl { +<<<<<<< HEAD inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper( +======= +TORCH_API inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& tensor) { auto functional_impl = static_cast(tensor.unsafeGetTensorImpl()); @@ -378,11 +424,16 @@ TORCH_API void propagate_xla_data_direct( Tensor create_functional_tensor_with_view_meta( const Tensor& view_to_wrap, const Tensor& base, +<<<<<<< HEAD const std::shared_ptr& meta, +======= + functionalization::ViewMeta meta, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t out_idx = 0); std::vector create_functional_tensor_with_view_meta( ITensorListRef view_to_wrap, const Tensor& base, +<<<<<<< HEAD const std::shared_ptr& meta); void mutate_view_meta( @@ -392,6 +443,13 @@ void mutate_view_meta( TORCH_API Tensor apply_view_meta_sequence( const Tensor& base, const std::vector>& sequence); +======= + const functionalization::ViewMeta& meta); + +void mutate_view_meta( + const Tensor& self, + const functionalization::ViewMeta& meta); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out); void set_sizes_strides_offset( diff --git a/aten/src/ATen/FunctionalizeFallbackKernel.cpp b/aten/src/ATen/FunctionalizeFallbackKernel.cpp index 10f988b4d2815..51f1e43a68498 100644 --- a/aten/src/ATen/FunctionalizeFallbackKernel.cpp +++ b/aten/src/ATen/FunctionalizeFallbackKernel.cpp @@ -1,5 +1,8 @@ +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -9,6 +12,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include @@ -29,6 +36,7 @@ #include #endif +<<<<<<< HEAD namespace at::functionalization { Tensor resize__ViewMeta::forward(const Tensor& base) { @@ -54,6 +62,8 @@ Tensor _unsafe_view_ViewMeta::reverse(const Tensor& base, const Tensor& mutated_ } // namespace at::functionalization +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace { void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) { const auto& schema = op.schema(); @@ -132,9 +142,13 @@ namespace { const auto& ivalue = returns[idx]; if (ivalue.isTensor() && should_wrap_outputs) { const auto& t = ivalue.toTensor(); +<<<<<<< HEAD if (!t.defined()) { continue; } +======= + if (!t.defined()) continue; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto 
t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t)); (*stack)[returns_begin + idx] = t_new; } else if (ivalue.isTensorList() && should_wrap_outputs) { @@ -197,8 +211,24 @@ static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatch // The output of resizing is equivalent to taking a slice of a larger tensor. // We have to emulate this "slicing" with an as_strided call. auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS(); +<<<<<<< HEAD auto view_meta = std::make_shared( reapply_views, size.vec()); +======= + at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( + [reapply_views = reapply_views, size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { + if (reapply_views) { + return base.as_strided(size, c10::contiguous_strides(size)); + } else { + return at::as_strided_copy(base, size, c10::contiguous_strides(size)); + } + }, + [size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { + return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size)); + }, + /*has_symbolic_inputs=*/false + ); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::functionalization::impl::mutate_view_meta(self, view_meta); return self; } @@ -317,11 +347,25 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt tmp_output = at::_unsafe_view_symint(self_, size); } +<<<<<<< HEAD bool has_symbolic_inputs = std::any_of( size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); }); auto view_meta = std::make_shared( has_symbolic_inputs, size.vec()); +======= + bool has_symbolic_inputs = std::any_of(size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); }); + + at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( + [size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { + return at::_unsafe_view_symint(base, size); + }, + [size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { + return at::_unsafe_view_symint(mutated_view, base.sym_sizes()); + }, + /*has_symbolic_inputs=*/has_symbolic_inputs + ); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta)); // See Note [Propagating strides in the functionalization pass] @@ -331,9 +375,17 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt auto stride = at::detail::computeStride(self.sym_sizes(), self.sym_strides(), inferred_size); if (!stride.has_value()) { +<<<<<<< HEAD TORCH_SYM_CHECK( self.sym_is_contiguous(), +======= + // With unbacked symints, computeStride could fail even on contiguous + // tensors. In this case, we can use the strides of an empty tensor of + // inferred_size. 
+ TORCH_CHECK( + self.is_contiguous(), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "View is not valid from size:", self.sym_sizes(), " stride: ", @@ -342,9 +394,12 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt inferred_size, " in case of unbacked symbols consider adding torch.check to guide computing strides."); +<<<<<<< HEAD // With unbacked symints, computeStride could fail even on contiguous // tensors. In this case, we can use the strides of an empty tensor of // inferred_size. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) stride = at::detail::empty_symint_meta( inferred_size, std::nullopt, diff --git a/aten/src/ATen/InferSize.h b/aten/src/ATen/InferSize.h index 817bf0ddba0b8..29c1596b0b2e3 100644 --- a/aten/src/ATen/InferSize.h +++ b/aten/src/ATen/InferSize.h @@ -4,7 +4,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -27,7 +30,13 @@ inline void infer_size_impl( std::optional infer_dim; for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) { if (TORCH_GUARD_OR_FALSE(sym_eq(shape[dim], -1))) { +<<<<<<< HEAD TORCH_CHECK(!infer_dim, "only one dimension can be inferred"); +======= + if (infer_dim) { + throw std::runtime_error("only one dimension can be inferred"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) infer_dim = dim; } else { // in case of unbacked shape[dim] we assume it's not -1 and add a runtime @@ -44,6 +53,7 @@ inline void infer_size_impl( } } +<<<<<<< HEAD if (infer_dim) { // numel is the product of known sizes, it has to be divisible by newsize. // and newsize should be positive unless newsize == numel (we throw @@ -77,6 +87,9 @@ inline void infer_size_impl( numel); } +======= + auto set_infer_dim = [&]() { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We have a degree of freedom here to select the dimension size; follow // NumPy semantics and just bail. However, a nice error message is needed // because users often use `view` as a way to flatten & unflatten @@ -85,15 +98,29 @@ inline void infer_size_impl( // works yet // empty_tensor.view(-1, 0) // doesn't. 
+<<<<<<< HEAD TORCH_MAYBE_SYM_CHECK( +======= + TORCH_CHECK( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) newsize != 0, "cannot reshape tensor of 0 elements into shape ", shape, " because the unspecified dimension size -1 can be any " "value and is ambiguous"); +<<<<<<< HEAD res[*infer_dim] = numel / newsize; return; +======= + res[*infer_dim] = numel / newsize; + return; + }; + + if (infer_dim && newsize > 0 && numel % newsize == 0) { + set_infer_dim(); + return; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } TORCH_MAYBE_SYM_CHECK( @@ -102,6 +129,12 @@ inline void infer_size_impl( shape, "' is invalid for input of size ", numel); +<<<<<<< HEAD +======= + if (infer_dim) { + set_infer_dim(); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline std::vector infer_size(IntArrayRef shape, int64_t numel) { diff --git a/aten/src/ATen/LegacyBatchedFallback.cpp b/aten/src/ATen/LegacyBatchedFallback.cpp index f2b527302a97b..f49559e21f97f 100644 --- a/aten/src/ATen/LegacyBatchedFallback.cpp +++ b/aten/src/ATen/LegacyBatchedFallback.cpp @@ -218,7 +218,11 @@ static Tensor safeStack(TensorList tensors) { // is possible for the backward function to return an undefined grad for some // grad_input for each example. In that case, we return an undefined grad. // +<<<<<<< HEAD // It is theoretically possible for *some* of the examples to produce an +======= + // It is theoretically posssible for *some* of the examples to produce an +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // undefined grad (a kernel could peek at the gradient values and return an // undefined tensor if it determines the gradient is full of zeros). We // could handle this by treating the undefined grad as a zero-filled tensor diff --git a/aten/src/ATen/LegacyBatchedTensorImpl.cpp b/aten/src/ATen/LegacyBatchedTensorImpl.cpp index cceefe985a7e2..d944682a2e8e2 100644 --- a/aten/src/ATen/LegacyBatchedTensorImpl.cpp +++ b/aten/src/ATen/LegacyBatchedTensorImpl.cpp @@ -84,7 +84,11 @@ IntArrayRef BatchedTensorImpl::strides_custom() const { // TODO: implement proper contiguity on batched tensor, then put // sizes_strides_policy back to Default +<<<<<<< HEAD c10::SymBool BatchedTensorImpl::sym_is_contiguous_custom(at::MemoryFormat memory_format) const { +======= +bool BatchedTensorImpl::is_contiguous_custom(at::MemoryFormat memory_format) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(memory_format == MemoryFormat::Contiguous, "NYI: querying is_contiguous inside of vmap for memory_format ", "other than torch.contiguous_format"); diff --git a/aten/src/ATen/LegacyBatchedTensorImpl.h b/aten/src/ATen/LegacyBatchedTensorImpl.h index 798e3535af3fb..22d2400b26a9e 100644 --- a/aten/src/ATen/LegacyBatchedTensorImpl.h +++ b/aten/src/ATen/LegacyBatchedTensorImpl.h @@ -82,8 +82,12 @@ struct TORCH_API BatchedTensorImpl : public c10::TensorImpl { IntArrayRef strides_custom() const override; // Override a bunch of methods inherited from TensorImpl to return error // messages. 
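The InferSize.h hunk above implements the usual view/reshape rule: at most one -1 entry, which is filled with numel divided by the product of the known sizes, with a dedicated error for the ambiguous 0-element case. A small assumed usage of the public helper declared at the bottom of that header:

```cpp
// Assumed usage of at::infer_size from the header above: {4, -1} with 24
// elements resolves to {4, 6}; a second -1, a non-divisible numel, or a
// 0-element tensor combined with -1 raises instead.
#include <ATen/InferSize.h>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> sizes = at::infer_size({4, -1}, /*numel=*/24);
  std::cout << sizes[0] << " x " << sizes[1] << std::endl;  // prints "4 x 6"
  return 0;
}
```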
+<<<<<<< HEAD c10::SymBool sym_is_contiguous_custom( at::MemoryFormat memory_format) const override; +======= + bool is_contiguous_custom(at::MemoryFormat memory_format) const override; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void set_size(int64_t dim, int64_t new_size) override; void set_stride(int64_t dim, int64_t new_stride) override; void set_storage_offset(int64_t storage_offset) override; diff --git a/aten/src/ATen/LegacyBatchingRegistrations.cpp b/aten/src/ATen/LegacyBatchingRegistrations.cpp index 2c54718e938fb..4b48afc5389fe 100644 --- a/aten/src/ATen/LegacyBatchingRegistrations.cpp +++ b/aten/src/ATen/LegacyBatchingRegistrations.cpp @@ -58,7 +58,11 @@ namespace at { namespace{ // PyTorch allows operations to specify dim 0 and dim -1 on a scalar tensor. +<<<<<<< HEAD bool is_allowed_dim_on_scalar_tensor(int64_t dim) { +======= +static bool is_allowed_dim_on_scalar_tensor(int64_t dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return dim == 0 || dim == -1; } @@ -365,7 +369,11 @@ Tensor select_batching_rule(const Tensor& self, int64_t dim, int64_t index) { return self_physical.getPhysicalToLogicalMap().apply(result); } +<<<<<<< HEAD int64_t getGradInputPhysicalDim(int64_t dim, IntArrayRef input_sizes, int64_t num_batch_dims) { +======= +static int64_t getGradInputPhysicalDim(int64_t dim, IntArrayRef input_sizes, int64_t num_batch_dims) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return maybe_wrap_dim(dim, static_cast(input_sizes.size())) + num_batch_dims; } @@ -488,7 +496,11 @@ Tensor view_as_complex_batching_rule(const Tensor& self) { // Checks that the smallest batch stride is greater than the largest example // stride. This is something we can support but we choose not to because it's // potentially error prone. +<<<<<<< HEAD void checkBatchDimsAtFrontInLayout(IntArrayRef physical_strides, int64_t num_batch_dims) { +======= +static void checkBatchDimsAtFrontInLayout(IntArrayRef physical_strides, int64_t num_batch_dims) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto smallest_batch_stride = std::min_element( physical_strides.begin(), physical_strides.begin() + num_batch_dims); auto largest_example_stride = std::max_element( @@ -508,7 +520,11 @@ void checkBatchDimsAtFrontInLayout(IntArrayRef physical_strides, int64_t num_bat // given (sizes, strides, storage_offset) returns the maximum location that // can be indexed (or nullopt if such a location doesn't exist, e.g., tensors // with zero-size dims). +<<<<<<< HEAD std::optional maximum_indexable_location( +======= +static std::optional maximum_indexable_location( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) IntArrayRef sizes, IntArrayRef strides, int64_t storage_offset) { auto result = native::storage_size_for(sizes, strides); if (result == 0) { @@ -521,7 +537,11 @@ std::optional maximum_indexable_location( // This checks that the range of possible memory locations accessible by // x.as_strided(sizes, strides, maybe_storage_offset) // are within the bounds of possible memory locations accessible by x. 
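Review note (illustrative sketch, not part of the patch): the as_strided validity check above compares the range of storage locations a candidate view can touch against the range its base can touch. A simplified standalone sketch of the quantity involved, with hypothetical names; the real helper additionally handles symbolic sizes and batch dimensions:

#include <cstdint>
#include <optional>
#include <vector>

// Largest flat storage index reachable by (sizes, strides, storage_offset):
//   storage_offset + sum_i (sizes[i] - 1) * strides[i]
// No location is reachable at all if any dimension has size 0.
std::optional<int64_t> max_indexable_location(
    const std::vector<int64_t>& sizes,
    const std::vector<int64_t>& strides,
    int64_t storage_offset) {
  int64_t max_index = storage_offset;
  for (size_t d = 0; d < sizes.size(); ++d) {
    if (sizes[d] == 0) return std::nullopt;  // zero-size dim: nothing addressable
    max_index += (sizes[d] - 1) * strides[d];
  }
  return max_index;
}

// In this simplified model, x.as_strided(sizes, strides, offset) is accepted
// when its maximum indexable location does not exceed that of x itself.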
+<<<<<<< HEAD void checkBasicAsStridedValidForSlice( +======= +static void checkBasicAsStridedValidForSlice( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& physical_tensor, int64_t num_batch_dims, IntArrayRef sizes, diff --git a/aten/src/ATen/LegacyVmapTransforms.h b/aten/src/ATen/LegacyVmapTransforms.h index be6cf1b697a22..3ca5c09332f98 100644 --- a/aten/src/ATen/LegacyVmapTransforms.h +++ b/aten/src/ATen/LegacyVmapTransforms.h @@ -140,7 +140,11 @@ struct TORCH_API VmapPhysicalView { // mapping a physical tensor to a new logical tensor (BatchedTensor) VmapPhysicalToLogicalMap getPhysicalToLogicalMap() const; +<<<<<<< HEAD // Maps a logical shape to a physical shape by prepending the batch +======= + // Maps a logical shape to a physical shape by pre-pending the batch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // sizes to the logical shape. VmapDimVector getPhysicalShape(IntArrayRef logical_shape) const; diff --git a/aten/src/ATen/MapAllocator.cpp b/aten/src/ATen/MapAllocator.cpp index ed697c32b58a8..ac8060bdbd03b 100644 --- a/aten/src/ATen/MapAllocator.cpp +++ b/aten/src/ATen/MapAllocator.cpp @@ -62,7 +62,11 @@ constexpr const char* unknown_eventname = "eventname not specified"; #endif } // namespace (anonymous) +<<<<<<< HEAD MapAllocator::MapAllocator(WithFd /*unused*/, std::string_view filename, int fd, int flags, size_t size) +======= +MapAllocator::MapAllocator(WithFd, std::string_view filename, int fd, int flags, size_t size) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) : filename_(filename.empty() ? 
unknown_filename : filename) , size_(0) // to be filled later #ifdef _WIN32 @@ -292,6 +296,7 @@ MapAllocator::MapAllocator(WithFd /*unused*/, std::string_view filename, int fd, if (ftruncate(fd, static_cast(size)) == -1) { TORCH_CHECK(false, "unable to resize file <", filename_, "> to the right size: ", c10::utils::str_error(errno), " (", errno, ")"); } +<<<<<<< HEAD #ifdef HAVE_POSIX_FALLOCATE if (flags_ & ALLOCATOR_MAPPED_SHAREDMEM) { @@ -314,6 +319,8 @@ MapAllocator::MapAllocator(WithFd /*unused*/, std::string_view filename, int fd, } #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (fstat(fd, &file_stat) == -1 || file_stat.st_size < static_cast(size)) { #ifndef STRIP_ERROR_MESSAGES int last_err = errno; @@ -321,7 +328,11 @@ MapAllocator::MapAllocator(WithFd /*unused*/, std::string_view filename, int fd, ::close(fd); TORCH_CHECK(false, "unable to stretch file <", filename_, "> to the right size: ", c10::utils::str_error(last_err), " (", last_err, ")"); } +<<<<<<< HEAD /* on macOS write returns with errno 45 (Operation not supported) when used +======= +/* on macOS write returns with errno 45 (Opperation not supported) when used +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * with a file descriptor obtained via shm_open */ #ifndef __APPLE__ @@ -494,7 +505,11 @@ RefcountedMapAllocator::RefcountedMapAllocator(const char *filename, int flags, initializeAlloc(); } +<<<<<<< HEAD RefcountedMapAllocator::RefcountedMapAllocator(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size) +======= +RefcountedMapAllocator::RefcountedMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) : RefcountedMapAllocatorArgCheck(flags) , MapAllocator(WITH_FD, filename, flags, fd, size + map_alloc_alignment) { @@ -614,7 +629,11 @@ at::DataPtr MapAllocator::makeDataPtr(std::string_view filename, int flags, size return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU}; } +<<<<<<< HEAD at::DataPtr MapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { +======= +at::DataPtr MapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto* context = new MapAllocator(WITH_FD, filename, fd, flags, size); if (actual_size_out) *actual_size_out = context->size(); return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU}; @@ -626,7 +645,11 @@ at::DataPtr RefcountedMapAllocator::makeDataPtr(const char *filename, int flags, return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU}; } +<<<<<<< HEAD at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { +======= +at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto* context = new RefcountedMapAllocator(WITH_FD, filename, fd, 
flags, size); if (actual_size_out) *actual_size_out = context->size() - map_alloc_alignment; return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU}; diff --git a/aten/src/ATen/MapAllocator.h b/aten/src/ATen/MapAllocator.h index 7a3415a4c4112..a9cf976b96cd3 100644 --- a/aten/src/ATen/MapAllocator.h +++ b/aten/src/ATen/MapAllocator.h @@ -25,7 +25,11 @@ class TORCH_API MapAllocator { public: MapAllocator(std::string_view filename, int flags, size_t size); MapAllocator( +<<<<<<< HEAD WithFd /*unused*/, +======= + WithFd, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::string_view filename, int fd, int flags, @@ -59,14 +63,22 @@ class TORCH_API MapAllocator { return flags_; } +<<<<<<< HEAD static MapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/); +======= + static MapAllocator* fromDataPtr(const at::DataPtr&); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static at::DataPtr makeDataPtr( std::string_view filename, int flags, size_t size, size_t* actual_size_out); static at::DataPtr makeDataPtr( +<<<<<<< HEAD WithFd /*unused*/, +======= + WithFd, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const char* filename, int fd, int flags, @@ -105,13 +117,21 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck, public: RefcountedMapAllocator(const char* filename, int flags, size_t size); RefcountedMapAllocator( +<<<<<<< HEAD WithFd /*unused*/, +======= + WithFd, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const char* filename, int fd, int flags, size_t size); +<<<<<<< HEAD static RefcountedMapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/); +======= + static RefcountedMapAllocator* fromDataPtr(const at::DataPtr&); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RefcountedMapAllocator(const RefcountedMapAllocator&) = delete; RefcountedMapAllocator(RefcountedMapAllocator&&) = delete; RefcountedMapAllocator& operator=(const RefcountedMapAllocator&) = delete; @@ -122,7 +142,11 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck, size_t size, size_t* actual_size_out); static at::DataPtr makeDataPtr( +<<<<<<< HEAD WithFd /*unused*/, +======= + WithFd, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const char* filename, int fd, int flags, diff --git a/aten/src/ATen/MemoryOverlap.cpp b/aten/src/ATen/MemoryOverlap.cpp index 1bc8c30158aec..004e35b82904c 100644 --- a/aten/src/ATen/MemoryOverlap.cpp +++ b/aten/src/ATen/MemoryOverlap.cpp @@ -24,7 +24,11 @@ MemOverlap has_internal_overlap(TensorImpl* t) { } } +<<<<<<< HEAD if (t->is_non_overlapping_and_dense_or_false()) { +======= + if (t->is_non_overlapping_and_dense()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return MemOverlap::No; } @@ -63,7 +67,11 @@ MemOverlapStatus get_overlap_status(const TensorImpl* a, const TensorImpl* b) { if (a->numel() == 0 || b->numel() == 0) { return MemOverlapStatus::No; } +<<<<<<< HEAD if (!a->is_non_overlapping_and_dense_or_false() || 
!b->is_non_overlapping_and_dense_or_false()) { +======= + if (!a->is_non_overlapping_and_dense() || !b->is_non_overlapping_and_dense()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return MemOverlapStatus::TooHard; } // Test for storage equality, rather than pointer equality. diff --git a/aten/src/ATen/NamedTensorUtils.cpp b/aten/src/ATen/NamedTensorUtils.cpp index 2de73a70dd332..6448c88c81e63 100644 --- a/aten/src/ATen/NamedTensorUtils.cpp +++ b/aten/src/ATen/NamedTensorUtils.cpp @@ -179,7 +179,11 @@ void propagate_names_except(const Tensor& result, const Tensor& src, IntArrayRef return; } const auto src_names = src.names(); +<<<<<<< HEAD const auto result_dim = result.dim(); +======= + const auto result_dim = static_cast(result.dim()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto src_dim = static_cast(src_names.size()); const auto excluded_dim = static_cast(excluded_idxs.size()); TORCH_INTERNAL_ASSERT(src_dim - excluded_dim == result_dim); diff --git a/aten/src/ATen/NamedTensorUtils.h b/aten/src/ATen/NamedTensorUtils.h index c6198dccd2431..9f0388c7fccf0 100644 --- a/aten/src/ATen/NamedTensorUtils.h +++ b/aten/src/ATen/NamedTensorUtils.h @@ -167,14 +167,22 @@ TORCH_API TensorImpl* propagate_names( TORCH_API void propagate_names(TensorImpl* result, /*const */ TensorImpl* src); +<<<<<<< HEAD inline void propagate_names( +======= +TORCH_API inline void propagate_names( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const TensorBase& result, DimnameList names, bool validate_names = false) { propagate_names(result.unsafeGetTensorImpl(), names, validate_names); } +<<<<<<< HEAD inline void propagate_names_if_nonempty( +======= +TORCH_API inline void propagate_names_if_nonempty( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const TensorBase& result, DimnameList names, bool validate_names = false) { @@ -182,7 +190,13 @@ inline void propagate_names_if_nonempty( result.unsafeGetTensorImpl(), names, validate_names); } +<<<<<<< HEAD inline void propagate_names(const TensorBase& result, const TensorBase& src) { +======= +TORCH_API inline void propagate_names( + const TensorBase& result, + const TensorBase& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) propagate_names(result.unsafeGetTensorImpl(), src.unsafeGetTensorImpl()); } diff --git a/aten/src/ATen/NestedTensorImpl.cpp b/aten/src/ATen/NestedTensorImpl.cpp index ea951ed3db136..fcf4d5183d721 100644 --- a/aten/src/ATen/NestedTensorImpl.cpp +++ b/aten/src/ATen/NestedTensorImpl.cpp @@ -211,7 +211,11 @@ NestedTensorImpl::NestedTensorImpl( } // assume contiguous, `nested_strides` and `offsets` +<<<<<<< HEAD // can be inferred from `nested_sizes` +======= +// can be infered from `nested_sizes` +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) NestedTensorImpl::NestedTensorImpl( const at::Tensor& buffer, const at::Tensor& nested_sizes) @@ -273,7 +277,11 @@ c10::SymInt NestedTensorImpl::sym_numel_custom() const { return NestedTensorImpl::numel_custom(); } +<<<<<<< HEAD c10::SymBool NestedTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const { 
+======= +bool NestedTensorImpl::is_contiguous_custom(MemoryFormat) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return nested_tensor_impl_is_contiguous(this); } IntArrayRef NestedTensorImpl::sizes_custom() const { diff --git a/aten/src/ATen/NestedTensorImpl.h b/aten/src/ATen/NestedTensorImpl.h index 9b92e9ec83ad2..56e931773068f 100644 --- a/aten/src/ATen/NestedTensorImpl.h +++ b/aten/src/ATen/NestedTensorImpl.h @@ -32,7 +32,11 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl { at::Tensor nested_strides, at::Tensor storage_offsets); // assume contiguous, `nested_strides` and `offsets` +<<<<<<< HEAD // can be inferred from `nested_sizes` +======= + // can be infered from `nested_sizes` +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) explicit NestedTensorImpl( const at::Tensor& buffer, const at::Tensor& nested_sizes); @@ -115,8 +119,12 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl { // with real implementations int64_t numel_custom() const override; c10::SymInt sym_numel_custom() const override; +<<<<<<< HEAD c10::SymBool sym_is_contiguous_custom( MemoryFormat /*memory_format*/) const override; +======= + bool is_contiguous_custom(MemoryFormat) const override; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t size_custom(int64_t d) const override { return this->size(d); } diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h index d09a33841b948..b16338dd9c6ac 100644 --- a/aten/src/ATen/Parallel.h +++ b/aten/src/ATen/Parallel.h @@ -14,7 +14,11 @@ inline int64_t divup(int64_t x, int64_t y) { TORCH_API void init_num_threads(); // Sets the number of threads to be used in parallel region +<<<<<<< HEAD TORCH_API void set_num_threads(int /*nthreads*/); +======= +TORCH_API void set_num_threads(int); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Returns the maximum number of threads that may be used in a parallel region TORCH_API int get_num_threads(); @@ -37,7 +41,11 @@ inline void lazy_init_num_threads() { } } +<<<<<<< HEAD TORCH_API void set_thread_num(int /*id*/); +======= +TORCH_API void set_thread_num(int); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class TORCH_API ThreadIdGuard { public: @@ -93,12 +101,20 @@ ident: identity for binary combination function sf. sf(ident, x) needs to return x. f: function for reduction over a chunk. f needs to be of signature scalar_t +<<<<<<< HEAD f(int64_t partial_begin, int64_t partial_end, scalar_t identify) +======= +f(int64_t partial_begin, int64_t partial_end, scalar_t identifiy) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sf: function to combine two partial results. sf needs to be of signature scalar_t sf(scalar_t x, scalar_t y) +<<<<<<< HEAD For example, you might have a tensor of 10000 entries and want to sum together +======= +For example, you might have a tensor of 10000 entires and want to sum together +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) all the elements. 
Parallel_reduce with a grain_size of 2500 will then allocate an intermediate result tensor with 4 elements. Then it will execute the function "f" you provide and pass the beginning and end index of these chunks, so @@ -130,7 +146,11 @@ inline scalar_t parallel_reduce( TORCH_API std::string get_parallel_info(); // Sets number of threads used for inter-op parallelism +<<<<<<< HEAD TORCH_API void set_num_interop_threads(int /*nthreads*/); +======= +TORCH_API void set_num_interop_threads(int); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Returns the number of threads used for inter-op parallelism TORCH_API size_t get_num_interop_threads(); diff --git a/aten/src/ATen/PythonTorchFunctionTLS.cpp b/aten/src/ATen/PythonTorchFunctionTLS.cpp index e90065543e35b..bf1692a6166ca 100644 --- a/aten/src/ATen/PythonTorchFunctionTLS.cpp +++ b/aten/src/ATen/PythonTorchFunctionTLS.cpp @@ -42,6 +42,7 @@ const PythonTorchFunctionTLS& PythonTorchFunctionTLS::get_state() { } bool torch_function_mode_enabled() { +<<<<<<< HEAD // Manually flatten because gcc is refusing to inline here. Note // that we are still calling __tls_get_addr twice here with GCC, // presumably because of @@ -50,6 +51,10 @@ bool torch_function_mode_enabled() { // performance. const auto& ptfs = pythonTorchFunctionState; return ptfs.disabled_state_ != TorchFunctionDisabledState::ALL_DISABLED && !ptfs.stack_.empty(); +======= + return PythonTorchFunctionTLS::get_disabled_state() != TorchFunctionDisabledState::ALL_DISABLED && + PythonTorchFunctionTLS::stack_len() > 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // This is needed to disambiguate the ternary torch function disabled states diff --git a/aten/src/ATen/PythonTorchFunctionTLS.h b/aten/src/ATen/PythonTorchFunctionTLS.h index 502bb535be050..e239ed73e487e 100644 --- a/aten/src/ATen/PythonTorchFunctionTLS.h +++ b/aten/src/ATen/PythonTorchFunctionTLS.h @@ -27,7 +27,10 @@ struct TORCH_API PythonTorchFunctionTLS { TorchFunctionDisabledState disabled_state_ = TorchFunctionDisabledState::ENABLED; std::vector> stack_; +<<<<<<< HEAD friend TORCH_API bool torch_function_mode_enabled(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; TORCH_API bool torch_function_mode_enabled(); diff --git a/aten/src/ATen/SavedTensorHooks.cpp b/aten/src/ATen/SavedTensorHooks.cpp index 69d0c243156fa..08a8e01e62c6d 100644 --- a/aten/src/ATen/SavedTensorHooks.cpp +++ b/aten/src/ATen/SavedTensorHooks.cpp @@ -13,7 +13,11 @@ namespace { // and left at true for the rest of the execution. // It's an optimization so that users who never use default hooks don't need to // read the thread_local variables pack_hook_ and unpack_hook_. 
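Review note (illustrative sketch, not part of the patch): the comment above describes a cheap gate that spares callers who never install hooks from touching thread_local state at all. A standalone sketch of that pattern with hypothetical names (the sketch uses an atomic flag for clarity; it is not the ATen implementation):

#include <atomic>
#include <functional>
#include <utility>

namespace sketch {
std::atomic<bool> hooks_ever_used{false};
thread_local std::function<int(int)> pack_hook;  // hypothetical per-thread hook slot

void set_pack_hook(std::function<int(int)> h) {
  hooks_ever_used.store(true, std::memory_order_relaxed);
  pack_hook = std::move(h);
}

int maybe_pack(int value) {
  // Fast path: skip the thread_local read entirely if no hook was ever set.
  if (!hooks_ever_used.load(std::memory_order_relaxed)) return value;
  return pack_hook ? pack_hook(value) : value;
}
}  // namespace sketch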
+<<<<<<< HEAD bool is_initialized(false); +======= + static bool is_initialized(false); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } static void assertSavedTensorHooksNotDisabled() { diff --git a/aten/src/ATen/SparseCsrTensorImpl.cpp b/aten/src/ATen/SparseCsrTensorImpl.cpp index dec6d2e95960b..e71b0b9e9f9d8 100644 --- a/aten/src/ATen/SparseCsrTensorImpl.cpp +++ b/aten/src/ATen/SparseCsrTensorImpl.cpp @@ -252,7 +252,14 @@ void SparseCsrTensorImpl::set_stride(int64_t dim, int64_t new_stride) { void SparseCsrTensorImpl::set_storage_offset(int64_t storage_offset) { TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have set_storage_offset."); } +<<<<<<< HEAD c10::SymBool SparseCsrTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const { TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have is_contiguous"); } +======= +bool SparseCsrTensorImpl::is_contiguous_custom(MemoryFormat) const { + TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have is_contiguous"); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at diff --git a/aten/src/ATen/SparseCsrTensorImpl.h b/aten/src/ATen/SparseCsrTensorImpl.h index e764f954db33e..e94b6971c547d 100644 --- a/aten/src/ATen/SparseCsrTensorImpl.h +++ b/aten/src/ATen/SparseCsrTensorImpl.h @@ -32,10 +32,17 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { public: explicit SparseCsrTensorImpl( +<<<<<<< HEAD at::DispatchKeySet /*key_set*/, at::Device device, Layout layout, const caffe2::TypeMeta /*data_type*/); +======= + at::DispatchKeySet, + at::Device device, + Layout layout, + const caffe2::TypeMeta); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void resize_(int64_t nnz, IntArrayRef size); void resize_and_clear_( @@ -86,8 +93,12 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { protected: IntArrayRef strides_custom() const override; SymIntArrayRef sym_strides_custom() const override; +<<<<<<< HEAD SymBool sym_is_contiguous_custom( MemoryFormat /*memory_format*/) const override; +======= + bool is_contiguous_custom(MemoryFormat) const override; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) public: void set_size(int64_t dim, int64_t new_size) override; diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index a2c12fcfe8b9b..b4ac24f951e0c 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -46,9 +46,13 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { public: // Public for now... 
+<<<<<<< HEAD explicit SparseTensorImpl( at::DispatchKeySet /*key_set*/, const caffe2::TypeMeta /*data_type*/); +======= + explicit SparseTensorImpl(at::DispatchKeySet, const caffe2::TypeMeta); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void release_resources() override; @@ -135,12 +139,21 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { "resize_ called on tensor with symbolic shape") TORCH_CHECK( sparse_dim + dense_dim == static_cast(size.size()), +<<<<<<< HEAD "'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ", size.size(), ", sparse_dim = ", sparse_dim, ", dense_dim = ", dense_dim); +======= + "number of dimensions must be sparse_dim (", + sparse_dim, + ") + dense_dim (", + dense_dim, + "), but got ", + size.size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (nnz() > 0) { [[maybe_unused]] auto constexpr alt_options_msg = "You could try the following options:\n\ @@ -231,14 +244,22 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { } void resize_(int64_t sparse_dim, int64_t dense_dim, ArrayRef size) { +<<<<<<< HEAD _resize_(sparse_dim, dense_dim, size); +======= + return _resize_(sparse_dim, dense_dim, size); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void resize_( int64_t sparse_dim, int64_t dense_dim, ArrayRef size) { +<<<<<<< HEAD _resize_(sparse_dim, dense_dim, size); +======= + return _resize_(sparse_dim, dense_dim, size); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // NOTE: this function will resize the sparse tensor and also set `indices` @@ -256,12 +277,21 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { "resize_and_clear_ called on tensor with symbolic shape") TORCH_CHECK( sparse_dim + dense_dim == static_cast(size.size()), +<<<<<<< HEAD "'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ", size.size(), ", sparse_dim = ", sparse_dim, ", dense_dim = ", dense_dim); +======= + "number of dimensions must be sparse_dim (", + sparse_dim, + ") + dense_dim (", + dense_dim, + "), but got ", + size.size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set_sizes_and_strides(size, std::vector(size.size())); sparse_dim_ = sparse_dim; @@ -386,8 +416,13 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { private: explicit SparseTensorImpl( +<<<<<<< HEAD at::DispatchKeySet /*key_set*/, const caffe2::TypeMeta /*data_type*/, +======= + at::DispatchKeySet, + const caffe2::TypeMeta, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::Tensor indices, at::Tensor values); diff --git a/aten/src/ATen/TensorIndexing.cpp b/aten/src/ATen/TensorIndexing.cpp index 1fa852686656f..603216cca9cc7 100644 --- a/aten/src/ATen/TensorIndexing.cpp +++ b/aten/src/ATen/TensorIndexing.cpp @@ -59,7 +59,11 @@ static inline void set_item(const Tensor& self, ArrayRef indices, c } } +<<<<<<< HEAD set_item(self, indices, value); +======= + return set_item(self, indices, value); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // namespace indexing diff --git 
a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index 9291d2e66e5f5..f5055d32e40fa 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -112,10 +112,17 @@ TORCH_API std::ostream& operator<<(std::ostream& stream, const Slice& slice); // `torch.tensor([1, 2])`) | `torch::tensor({1, 2})` struct TORCH_API TensorIndex final { // Case 1: `at::indexing::None` +<<<<<<< HEAD TensorIndex(std::nullopt_t /*unused*/) : type_(TensorIndexType::None) {} // Case 2: "..." / `at::indexing::Ellipsis` TensorIndex(at::indexing::EllipsisIndexType /*unused*/) +======= + TensorIndex(std::nullopt_t) : type_(TensorIndexType::None) {} + + // Case 2: "..." / `at::indexing::Ellipsis` + TensorIndex(at::indexing::EllipsisIndexType) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) : type_(TensorIndexType::Ellipsis) {} TensorIndex(const char* str) : TensorIndex(at::indexing::Ellipsis) { TORCH_CHECK_VALUE( @@ -214,7 +221,11 @@ inline Tensor applySlice( "step must be greater than zero"); // See NOTE [nested tensor size for indexing] +<<<<<<< HEAD if (self_sizes.has_value() && !self_sizes.value().empty()) { +======= + if (self_sizes.has_value()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Skip this optimization if we are tracing, as the trace may be polymorphic // over the shape of the `self` tensor, and we still want to record // the slice. @@ -223,7 +234,11 @@ inline Tensor applySlice( : self.sym_size(dim); if (!disable_slice_optimization && TORCH_STATICALLY_KNOWN_TRUE(start.sym_eq(0)) && +<<<<<<< HEAD TORCH_STATICALLY_KNOWN_TRUE(length.sym_le(stop)) && step == 1) { +======= + TORCH_STATICALLY_KNOWN_TRUE(length.sym_eq(stop)) && step == 1) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return self; } } @@ -252,7 +267,11 @@ inline Tensor applySelect( // Note: `size >= -index` is not equivalent to `size > -1 - index` if index // is INT64_MIN For std::numeric_limits::min() result of unary // minus is undefined by the standard but in practice is equal to self. 
On +<<<<<<< HEAD // the other hand, indexing wrapping is valid for all negative int64_t +======= + // the other hand, indexing wraping is valid for all negative int64_t +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // values, as x[INT64_MIN] is the same as x[INT64_MAX] TORCH_CHECK_INDEX( size.sym_gt(-1 - index) @@ -315,6 +334,7 @@ inline void recordTensorIndex( const Tensor& tensor, std::vector& outIndices, int64_t* dim_ptr) { +<<<<<<< HEAD if (outIndices.empty()) { outIndices.resize(*dim_ptr + 1); outIndices[*dim_ptr] = tensor; @@ -326,6 +346,12 @@ inline void recordTensorIndex( } else { *dim_ptr += 1; } +======= + // TODO: check scalarType + outIndices.resize(*dim_ptr + 1); + outIndices[*dim_ptr] = tensor; + (*dim_ptr)++; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline c10::List<::std::optional> typeConvertIndices( @@ -465,6 +491,7 @@ inline Tensor handleDimInMultiDimIndexing( original_tensor_device, prev_dim_result_sizes); (*dim_ptr)++; +<<<<<<< HEAD if (!outIndices.empty()) { outIndices.resize(outIndices.size() + 1); } @@ -475,13 +502,21 @@ inline Tensor handleDimInMultiDimIndexing( if (!outIndices.empty()) { outIndices.resize(outIndices.size() + ellipsis_ndims); } +======= + return result; + } else if (index.is_ellipsis()) { + (*dim_ptr) += original_tensor.dim() - (*specified_dims_ptr); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return prev_dim_result; } else if (index.is_none()) { Tensor result = prev_dim_result.unsqueeze(*dim_ptr); (*dim_ptr)++; +<<<<<<< HEAD if (!outIndices.empty()) { outIndices.resize(outIndices.size() + 1); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return result; } else if (index.is_boolean()) { Tensor result = prev_dim_result.unsqueeze(*dim_ptr); @@ -577,10 +612,13 @@ inline Tensor applySlicing( inline Tensor dispatch_index( const Tensor& self, std::vector&& indices) { +<<<<<<< HEAD // Remove trailing null elements from indices while (!indices.empty() && !indices.back().defined()) { indices.pop_back(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return self.index(impl::typeConvertIndices(self, std::move(indices))); } @@ -588,10 +626,13 @@ inline Tensor dispatch_index_put_( Tensor& self, std::vector&& indices, const Tensor& value) { +<<<<<<< HEAD // Remove trailing null elements from indices while (!indices.empty() && !indices.back().defined()) { indices.pop_back(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return self.index_put_( impl::typeConvertIndices(self, std::move(indices)), value); } diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index b10d5c7d1fc3f..f9c28ed832b88 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -56,7 +56,11 @@ inline void get_strides(int64_t* strides, ArrayRef operands, int64_ } } +<<<<<<< HEAD OptionalTensorRef make_otr(const TensorBase &tensor) { +======= +static OptionalTensorRef make_otr(const TensorBase &tensor) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) if (tensor.defined()) { return OptionalTensorRef(tensor); } else { @@ -208,7 +212,11 @@ bool TensorIteratorConfig::is_tensor_const(size_t idx) { // same strides are increasing. If dimensions are non-increasing, we move on to the next input to break the tie. // // Instead of applying rule 4 for tie breaking, we could move on to the next tensor directly. This would result in possibly +<<<<<<< HEAD // losing the correct permutation of the first tensor if there are permuted trivial dimensions, but could potentially +======= +// losing the correct permuation of the first tensor if there are permuted trivial dimensions, but could potentially +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // improve traversal order of the second tensor. We chose the former option to better propagate channels last layout // for example for a tensor with the sizes N1H1 // These rules result in the intuitive behavior that in most cases recovers permutation of either the first argument (if all @@ -244,7 +252,11 @@ void TensorIteratorBase::reorder_dimensions() { // initialize perm with n-1, n-2, ..., 1, 0 std::iota(perm_.rbegin(), perm_.rend(), 0); +<<<<<<< HEAD // Reordering dimensions changes iteration order +======= + // Reordering dimensions changes iteraton order +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (enforce_linear_iteration_) { permute_dimensions(perm_); return; @@ -765,8 +777,12 @@ void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) { if (numel == 0) { return; } else if (numel < grain_size || at::get_num_threads() == 1) { +<<<<<<< HEAD serial_for_each(loop, {0, numel}); return; +======= + return serial_for_each(loop, {0, numel}); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) { serial_for_each(loop, {begin, end}); @@ -1534,7 +1550,11 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) { // XLA and lazy tensors don't have storage, so they don't have an underlying data pointer. // Nothing beyond this point is important for meta functions, so it's fine to exit early here. +<<<<<<< HEAD // Extend the condition to MAIA tensors as MAIA tensors also don't have storage. +======= + // Extend the condition to MAIA tesnors as MAIA tensors also don't have storage. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (privateuse1_without_storage || common_device_.type() == DeviceType::XLA || common_device_.type() == DeviceType::IPU || diff --git a/aten/src/ATen/TensorIterator.h b/aten/src/ATen/TensorIterator.h index d8593a80292b3..dac89cece9c80 100644 --- a/aten/src/ATen/TensorIterator.h +++ b/aten/src/ATen/TensorIterator.h @@ -250,7 +250,11 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { using PtrVector = SmallVector; using StrideVector = SmallVector; +<<<<<<< HEAD void build(TensorIteratorConfig& /*config*/); +======= + void build(TensorIteratorConfig&); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // The inner-loop function operates on the fastest moving dimension. It // implements element-wise operations in terms of 1-d strided tensors. 
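Review note (illustrative sketch, not part of the patch): the TensorIterator comment above says the inner loop works on 1-d strided operands, i.e. raw byte pointers advanced by per-operand byte strides, which is what lets one loop serve contiguous, strided, and broadcast inputs alike. A self-contained sketch of such an inner loop (hypothetical names, float-only for brevity):

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// out[i] = a[i] + b[i]; each operand carries its own byte stride, and a stride
// of 0 broadcasts a single scalar across the whole run.
void add_1d_strided(char* out, const char* a, const char* b,
                    int64_t out_stride, int64_t a_stride, int64_t b_stride,
                    int64_t n) {
  for (int64_t i = 0; i < n; ++i) {
    float va, vb, vo;
    std::memcpy(&va, a + i * a_stride, sizeof(float));
    std::memcpy(&vb, b + i * b_stride, sizeof(float));
    vo = va + vb;
    std::memcpy(out + i * out_stride, &vo, sizeof(float));
  }
}

int main() {
  std::vector<float> a{1, 2, 3, 4}, out(4);
  float scalar = 10.f;
  // Broadcast `scalar` over `a` by giving it a byte stride of 0.
  add_1d_strided(reinterpret_cast<char*>(out.data()),
                 reinterpret_cast<const char*>(a.data()),
                 reinterpret_cast<const char*>(&scalar),
                 sizeof(float), sizeof(float), 0, 4);
  for (float v : out) std::cout << v << ' ';  // prints: 11 12 13 14
}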
@@ -388,7 +392,11 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { /// Return scalar value from original_tensor_base if it is defined. When /// common_dtype is Half, casting scalar input to common_dtype might overflow. +<<<<<<< HEAD /// If the scalar is already given in the type of Half, then return scalar +======= + /// If the scalar is aleady given in the type of Half, then return scalar +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /// value from tensor_base. template T original_scalar_value(int64_t arg) { @@ -502,7 +510,11 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { /// kernels bool can_use_32bit_indexing() const; +<<<<<<< HEAD /// An "iterable" object that recursively splits this iterator into +======= + /// An "iteratable" object that recursively splits this iterator into +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /// sub-iterators that can use 32-bit indexing. SplitUntil32Bit with_32bit_indexing() const; @@ -618,6 +630,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { #undef TORCH_DISALLOW_TEMPORARIES protected: // Mutable reference as it moves tensors out of TensorIteratorConfig +<<<<<<< HEAD void populate_operands(TensorIteratorConfig& /*config*/); void mark_outputs(); void mark_resize_outputs(const TensorIteratorConfig& /*config*/); @@ -632,6 +645,22 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { bool fast_set_up(const TensorIteratorConfig& /*config*/); FastSetupType compute_fast_setup_type(const TensorIteratorConfig& /*config*/); void compute_names(const TensorIteratorConfig& /*config*/); +======= + void populate_operands(TensorIteratorConfig&); + void mark_outputs(); + void mark_resize_outputs(const TensorIteratorConfig&); + void compute_mem_overlaps(const TensorIteratorConfig&); + void compute_shape(const TensorIteratorConfig&); + void compute_strides(const TensorIteratorConfig&); + void reorder_dimensions(); + void permute_dimensions(IntArrayRef perm); + void compute_types(const TensorIteratorConfig&); + ScalarType compute_common_dtype(); + void allocate_or_resize_outputs(); + bool fast_set_up(const TensorIteratorConfig&); + FastSetupType compute_fast_setup_type(const TensorIteratorConfig&); + void compute_names(const TensorIteratorConfig&); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void propagate_names_to_outputs(); void coalesce_dimensions(); @@ -878,7 +907,11 @@ class TORCH_API TensorIteratorConfig final { // Sets the enforce_linear_iteration_ flag, which is false by default. // If true, iteration goes in the same order as a C-contiguous tensor +<<<<<<< HEAD // is laid out in memory. i.e. last dimension iterates fastest. +======= + // is layed out in memory. i.e. last dimension iterates fastest. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // This iteration order can be less efficient and may even prevent // vectorization. So only use if the correctness of your kernel depends on it. 
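Review note (illustrative sketch, not part of the patch): "linear iteration" in the comment above means visiting elements in the order a C-contiguous tensor is laid out in memory, with the last dimension moving fastest. A tiny standalone sketch that decodes a flat index into coordinates in exactly that order (hypothetical helper):

#include <cstdint>
#include <iostream>
#include <vector>

void for_each_row_major(const std::vector<int64_t>& shape) {
  int64_t numel = 1;
  for (int64_t s : shape) numel *= s;
  for (int64_t flat = 0; flat < numel; ++flat) {
    // Decode the flat index into coordinates, last dimension fastest.
    std::vector<int64_t> idx(shape.size());
    int64_t rem = flat;
    for (int64_t d = static_cast<int64_t>(shape.size()) - 1; d >= 0; --d) {
      idx[d] = rem % shape[d];
      rem /= shape[d];
    }
    for (int64_t v : idx) std::cout << v << ' ';
    std::cout << '\n';
  }
}

int main() { for_each_row_major({2, 3}); }  // (0,0) (0,1) (0,2) (1,0) (1,1) (1,2)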
diff --git a/aten/src/ATen/TensorSubclassLikeUtils.h b/aten/src/ATen/TensorSubclassLikeUtils.h index 515642a0c51d2..73c3b3c2d87b2 100644 --- a/aten/src/ATen/TensorSubclassLikeUtils.h +++ b/aten/src/ATen/TensorSubclassLikeUtils.h @@ -78,7 +78,11 @@ inline bool areAnyOptionalTensorSubclassLike( // NOTE: This function expects a scalar tensor of boolean dtype. // Eg. // Non-Composite Compliant Pattern : (t == 0).all().item() +<<<<<<< HEAD // Composite Compliant Pattern : is_salar_tensor_true((t == 0).all()) +======= +// Composite Compliant Patter : is_salar_tensor_true((t == 0).all()) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline bool is_scalar_tensor_true(const Tensor& t) { TORCH_INTERNAL_ASSERT(t.dim() == 0) TORCH_INTERNAL_ASSERT(t.scalar_type() == kBool) diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 8236751679f06..68466ef915e41 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -273,11 +273,19 @@ void checkLayout(CheckedFrom c, at::ArrayRef tensors, at::Layout layout) } void * maybe_data_ptr(const Tensor& tensor) { +<<<<<<< HEAD return tensor.defined() ? tensor.data_ptr() : nullptr; } void * maybe_data_ptr(const TensorArg& tensor) { return tensor->defined() ? tensor->data_ptr() : nullptr; +======= + return tensor.defined() ? (void *)tensor.data_ptr() : nullptr; +} + +void * maybe_data_ptr(const TensorArg& tensor) { + return tensor->defined() ? (void *)tensor->data_ptr() : nullptr; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void check_dim_size( @@ -378,9 +386,15 @@ inline static std::optional computeStride_impl( (TORCH_GUARD_OR_TRUE(sym_ne(oldshape[tensor_d - 1], 1)) && TORCH_GUARD_OR_TRUE(sym_ne(oldstride[tensor_d - 1], tensor_numel * chunk_base_stride)))) { // We want to accumulate stuff in view_numel until view_numel == tensor_numel, if we do not +<<<<<<< HEAD // know if that is satisfied we keep accumulating. For example if view_numel = 1 and tensor_numel = u1, // we want to take that path, view_numel will become u0. Next iteration if u0==u1 we want to stop. // That's why we use TORCH_GUARD_OR_TRUE below. +======= + // know if that is satisfied we keep accumalating. For example if view_numel = 1 and tensor_numel = u1, + // we want to take that path, view_numel will become u0. Next iteration if u0==u1 we want to stop. + // Thats why we use TORCH_GUARD_OR_TRUE below. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // we use TORCH_GUARD_OR_FALSE and not TORCH_GUARD_OR_TRUE when comparing newshape[view_d] ==1 because // if we know view_numel < tensor_numel is false, we want to stop. 
Unless we know for sure newshape[view_d]==1 diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index 22509c7be4e19..bfdd8f9bf51c5 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -8,7 +8,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { @@ -20,7 +23,10 @@ ThreadLocalState::ThreadLocalState() torch_dispatch_mode_state_(c10::impl::TorchDispatchModeTLS::get_state()), python_dispatcher_state_(c10::impl::PythonDispatcherTLS::get_state()), python_torch_function_state_(at::impl::PythonTorchFunctionTLS::get_state()), saved_tensors_default_hooks_state_(at::SavedTensorDefaultHooks::get_tls_state()), functionalization_reapply_views_state_(at::functionalization::impl::getFunctionalizationReapplyViewsTLS()), +<<<<<<< HEAD dtensor_allow_implicit_replication_(at::get_dtensor_allow_implicit_replication()), +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) saved_objects_(at::impl::ThreadLocalPythonObjects::get_state()) { #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) && !defined(BUILD_LITE_INTERPRETER) for(size_t i=0; i>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::ThreadLocalDebugInfo::_forceCurrentDebugInfo(state.debug_info_); c10::impl::_force_tls_local_dispatch_key_set(state.dispatch_key_); diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h index d0d8112fc4cec..7728f99799d60 100644 --- a/aten/src/ATen/ThreadLocalState.h +++ b/aten/src/ATen/ThreadLocalState.h @@ -75,8 +75,11 @@ class TORCH_API ThreadLocalState { bool functionalization_reapply_views_state_; +<<<<<<< HEAD bool dtensor_allow_implicit_replication_; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // TLS for arbitrary python objects that is registered via hooks at::impl::ThreadLocalPythonObjects saved_objects_; diff --git a/aten/src/ATen/TracerMode.h b/aten/src/ATen/TracerMode.h index d0d4c93a84f53..bffd0abda1533 100644 --- a/aten/src/ATen/TracerMode.h +++ b/aten/src/ATen/TracerMode.h @@ -27,7 +27,11 @@ // ops (ops being called by other ops). After the intermediate op call // finishes it's set back to the original `TracingState` object. // +<<<<<<< HEAD // The `TracingState` object in TLS can also be read/written via its Python +======= +// The `TracingState` obect in TLS can also be read/written via its Python +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // binding in `python_tracer.cpp`, and `get/setTracingState()` C++ APIs, // which are also exposed as `TORCH_API`. // diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h index e9c936b906c67..a37d1bb99842f 100644 --- a/aten/src/ATen/Utils.h +++ b/aten/src/ATen/Utils.h @@ -20,7 +20,11 @@ namespace at { +<<<<<<< HEAD TORCH_API int _crash_if_asan(int /*arg*/); +======= +TORCH_API int _crash_if_asan(int); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Converts a TensorList (i.e. ArrayRef to vector of TensorImpl*) // NB: This is ONLY used by legacy TH bindings, and ONLY used by cat. 
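Review note (illustrative sketch, not part of the patch): many hunks in this patch, including the _crash_if_asan declaration just above, differ only in whether an otherwise-unnamed parameter carries its name in a comment. A small self-contained example of that convention (hypothetical function):

#include <iostream>

// Declaration: the commented-out name documents what the int means while the
// parameter stays formally unnamed, so the definition below produces no
// -Wunused-parameter warning and commonly used style checks (e.g. clang-tidy's
// readability-named-parameter) accept the commented form.
int crash_if_asan_sketch(int /*arg*/);

// Definition: the parameter is genuinely unused here, so it stays unnamed too.
int crash_if_asan_sketch(int /*arg*/) {
  return 0;  // placeholder body for the sketch
}

int main() { std::cout << crash_if_asan_sketch(3) << '\n'; }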
diff --git a/aten/src/ATen/Version.cpp b/aten/src/ATen/Version.cpp index 7239f357fdd64..5bc698b6cb3a7 100644 --- a/aten/src/ATen/Version.cpp +++ b/aten/src/ATen/Version.cpp @@ -95,6 +95,7 @@ std::string get_cpu_capability() { // environment variable auto capability = native::get_cpu_capability(); switch (capability) { +<<<<<<< HEAD case native::CPUCapability::DEFAULT: return "DEFAULT"; #if defined(HAVE_VSX_CPU_DEFINITION) @@ -107,6 +108,26 @@ std::string get_cpu_capability() { case native::CPUCapability::SVE256: return "SVE256"; #else +======= +#if defined(HAVE_VSX_CPU_DEFINITION) + case native::CPUCapability::DEFAULT: + return "DEFAULT"; + case native::CPUCapability::VSX: + return "VSX"; +#elif defined(HAVE_ZVECTOR_CPU_DEFINITION) + case native::CPUCapability::DEFAULT: + return "DEFAULT"; + case native::CPUCapability::ZVECTOR: + return "Z VECTOR"; +#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION) + case native::CPUCapability::DEFAULT: + return "DEFAULT"; + case native::CPUCapability::SVE256: + return "SVE256"; +#else + case native::CPUCapability::DEFAULT: + return "NO AVX"; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case native::CPUCapability::AVX2: return "AVX2"; case native::CPUCapability::AVX512: diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h index aa000b118daa2..a4c5d1e5b1224 100644 --- a/aten/src/ATen/WrapDimUtils.h +++ b/aten/src/ATen/WrapDimUtils.h @@ -121,7 +121,11 @@ inline int64_t legacy_cat_wrap_dim_symint( const std::vector>& tensor_sizes) { for (auto& sizes : tensor_sizes) { if (sizes.size() == 1) { +<<<<<<< HEAD if (TORCH_GUARD_OR_FALSE(sizes[0].sym_eq(0))) { +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[0].sym_eq(0))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue; } } @@ -135,7 +139,11 @@ inline int64_t legacy_cat_wrap_dim( const MaterializedITensorListRef& tensors) { for (const Tensor& tensor : tensors) { if (tensor.dim() == 1) { +<<<<<<< HEAD if (TORCH_GUARD_OR_FALSE(tensor.sym_sizes()[0].sym_eq(0))) { +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(tensor.sym_sizes()[0].sym_eq(0))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue; } } diff --git a/aten/src/ATen/ZeroTensorFallback.cpp b/aten/src/ATen/ZeroTensorFallback.cpp index 40b34030b85b9..d29d2c981ad26 100644 --- a/aten/src/ATen/ZeroTensorFallback.cpp +++ b/aten/src/ATen/ZeroTensorFallback.cpp @@ -9,6 +9,7 @@ namespace at { +<<<<<<< HEAD /* * Design: * 1. ZeroTensors are regular tensors with TensorOptions, a storage @@ -39,6 +40,9 @@ namespace at { * it does not perfectly handle NaNs and Infs as we don't check the actual values * and assume that they are non-zero, non-inf, non-NaN etc. */ +======= + // TODO: add a note explaining the design decisions +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // ZeroTensors are designed to be immutable. 
Thus, we error out when an in-place operation is performed on ZeroTensors static void zeroTensorFallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { const auto& arguments = op.schema().arguments(); @@ -124,7 +128,11 @@ namespace at { m.impl("clone", torch::CppFunction::makeFallthrough()); m.impl("dot", torch::CppFunction::makeFallthrough()); m.impl("vdot", torch::CppFunction::makeFallthrough()); +<<<<<<< HEAD // The functions in the list below have a specific registration in native_functions.yaml and +======= + // The functions in the list below have a specific registeration in native_functions.yaml and +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // do not use the fallback. // m.impl("mul.Tensor", torch::CppFunction::makeFallthrough()); // m.impl("add.Tensor", torch::CppFunction::makeFallthrough()); diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index e3424cc4cb8eb..a2d3affa24bbc 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -36,7 +36,11 @@ namespace { using weakref_type = c10::weak_intrusive_ptr; using val_type = std::tuple; +<<<<<<< HEAD ska::flat_hash_map& get_cached_casts() { +======= +static ska::flat_hash_map& get_cached_casts() { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static ska::flat_hash_map cached_casts; return cached_casts; } @@ -148,7 +152,11 @@ Tensor cached_cast(at::ScalarType to_type, const Tensor& arg, DeviceType device_ Banned functions *******************************/ +<<<<<<< HEAD static Tensor binary_cross_entropy_banned(const Tensor & /*unused*/, const Tensor & /*unused*/, const std::optional& /*unused*/, int64_t /*unused*/) { +======= +static Tensor binary_cross_entropy_banned(const Tensor &, const Tensor &, const std::optional&, int64_t) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "torch.nn.functional.binary_cross_entropy and torch.nn.BCELoss are unsafe to autocast.\n" "Many models use a sigmoid layer right before the binary cross entropy layer.\n" "In this case, combine the two layers using torch.nn.functional.binary_cross_entropy_with_logits\n" @@ -216,7 +224,10 @@ TORCH_LIBRARY_IMPL(aten, AutocastMPS, m) { KERNEL_MPS(_convolution, lower_precision_fp) KERNEL_MPS(conv1d, lower_precision_fp) KERNEL_MPS(conv2d, lower_precision_fp) +<<<<<<< HEAD KERNEL_MPS(conv3d, lower_precision_fp) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) KERNEL_MPS(conv_tbc, lower_precision_fp) KERNEL_MPS(conv_transpose1d, lower_precision_fp) KERNEL_MPS(conv_transpose2d, input, lower_precision_fp) @@ -240,7 +251,10 @@ TORCH_LIBRARY_IMPL(aten, AutocastMPS, m) { KERNEL_MPS(scaled_dot_product_attention, lower_precision_fp) // fp32 +<<<<<<< HEAD KERNEL_MPS(conv_transpose3d, input, fp32) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) KERNEL_MPS(acos, fp32) KERNEL_MPS(asin, fp32) KERNEL_MPS(cosh, fp32) diff --git a/aten/src/ATen/autocast_mode.h b/aten/src/ATen/autocast_mode.h index 655b2343d5d5c..96d719004904c 100644 --- a/aten/src/ATen/autocast_mode.h +++ b/aten/src/ATen/autocast_mode.h @@ -25,7 +25,11 @@ TORCH_API void 
set_autocast_cache_enabled(bool enabled); // deprecated CUDA-specific autocast APIs C10_DEPRECATED_MESSAGE( "at::autocast::is_enabled() is deprecated. Please use at::autocast::is_autocast_enabled(at::kCUDA) instead.") +<<<<<<< HEAD inline bool is_enabled() { +======= +TORCH_API inline bool is_enabled() { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_DEPRECATION( "at::autocast::", __func__, @@ -34,7 +38,11 @@ inline bool is_enabled() { } C10_DEPRECATED_MESSAGE( "at::autocast::set_enabled(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(at::kCUDA, enabled) instead.") +<<<<<<< HEAD inline void set_enabled(bool enabled) { +======= +TORCH_API inline void set_enabled(bool enabled) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_DEPRECATION( "at::autocast::", __func__, @@ -43,7 +51,11 @@ inline void set_enabled(bool enabled) { } C10_DEPRECATED_MESSAGE( "at::autocast::get_autocast_gpu_dtype() is deprecated. Please use at::autocast::get_autocast_dtype(at::kCUDA) instead.") +<<<<<<< HEAD inline at::ScalarType get_autocast_gpu_dtype() { +======= +TORCH_API inline at::ScalarType get_autocast_gpu_dtype() { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_DEPRECATION( "at::autocast::", __func__, @@ -52,7 +64,11 @@ inline at::ScalarType get_autocast_gpu_dtype() { } C10_DEPRECATED_MESSAGE( "at::autocast::set_autocast_gpu_dtype(dtype) is deprecated. Please use at::autocast::set_autocast_dtype(at::kCUDA, dtype) instead.") +<<<<<<< HEAD inline void set_autocast_gpu_dtype(at::ScalarType dtype) { +======= +TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_DEPRECATION( "at::autocast::", __func__, @@ -65,7 +81,11 @@ inline void set_autocast_gpu_dtype(at::ScalarType dtype) { "at::autocast::is_" #name \ "_enabled() is deprecated. Please use at::autocast::is_autocast_enabled(" #device_type \ ") instead.") \ +<<<<<<< HEAD inline bool is_##name##_enabled() { \ +======= + TORCH_API inline bool is_##name##_enabled() { \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_DEPRECATION( \ "at::autocast::", \ __func__, \ @@ -78,7 +98,11 @@ inline void set_autocast_gpu_dtype(at::ScalarType dtype) { "at::autocast::set_" #name \ "_enabled(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(" #device_type \ ", enabled) instead.") \ +<<<<<<< HEAD inline void set_##name##_enabled(bool enabled) { \ +======= + TORCH_API inline void set_##name##_enabled(bool enabled) { \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_DEPRECATION( \ "at::autocast::", \ __func__, \ @@ -91,7 +115,11 @@ inline void set_autocast_gpu_dtype(at::ScalarType dtype) { "at::autocast::get_autocast_" #name \ "_dtype() is deprecated. 
Please use at::autocast::get_autocast_dtype(" #device_type \ ") instead.") \ +<<<<<<< HEAD inline at::ScalarType get_autocast_##name##_dtype() { \ +======= + TORCH_API inline at::ScalarType get_autocast_##name##_dtype() { \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_DEPRECATION( \ "at::autocast::", \ __func__, \ @@ -104,7 +132,11 @@ inline void set_autocast_gpu_dtype(at::ScalarType dtype) { "at::autocast::set_autocast_" #name \ "_dtype(dtype) is deprecated. Please use at::autocast::set_autocast_dtype(" #device_type \ ", dtype) instead.") \ +<<<<<<< HEAD inline void set_autocast_##name##_dtype(at::ScalarType dtype) { \ +======= + TORCH_API inline void set_autocast_##name##_dtype(at::ScalarType dtype) { \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_DEPRECATION( \ "at::autocast::", \ __func__, \ @@ -377,7 +409,11 @@ Keep it simple for now by assuming only one such flag is present in the argument list. If I ever need a function with more than flag I'll figure out something else. The policy is: +<<<<<<< HEAD If the user has explicitly specified a dtype, respect it. +======= +If the user has explicity specified a dtype, respect it. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Otherwise, set it to the autocast type. ********************************************************/ diff --git a/aten/src/ATen/core/CachingHostAllocator.cpp b/aten/src/ATen/core/CachingHostAllocator.cpp index f3ddaedc5ecd6..a6428259d7f18 100644 --- a/aten/src/ATen/core/CachingHostAllocator.cpp +++ b/aten/src/ATen/core/CachingHostAllocator.cpp @@ -6,9 +6,15 @@ namespace at { namespace { +<<<<<<< HEAD std::array allocator_array{}; std::array +======= +static std::array + allocator_array{}; +static std::array +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) allocator_priority{}; } // anonymous namespace diff --git a/aten/src/ATen/core/CachingHostAllocator.h b/aten/src/ATen/core/CachingHostAllocator.h index 603e7e73bc1ea..66c8989fb133b 100644 --- a/aten/src/ATen/core/CachingHostAllocator.h +++ b/aten/src/ATen/core/CachingHostAllocator.h @@ -1,12 +1,18 @@ #pragma once #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -39,7 +45,11 @@ struct HostBlock { }; template +<<<<<<< HEAD struct alignas(hardware_destructive_interference_size) FreeBlockList { +======= +struct alignas(64) FreeBlockList { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::mutex mutex_; std::deque list_; }; @@ -50,6 +60,7 @@ namespace { constexpr size_t MAX_SIZE_INDEX = 64; } +<<<<<<< HEAD // A large reserved pinned memory segment that is created in advance which is used // to allocate small pinned memory requests to avoid calling into expensive APIs. 
// We never free this memory and move up the pointer as we allocate new blocks @@ -101,6 +112,21 @@ struct TORCH_API HostStats { // SUM: bytes allocated/reserved by this memory allocator. This accounts // for both free and in-use blocks. Stat allocated_bytes; +======= +// Struct containing memory allocator summary statistics for host. +struct TORCH_API HostStats { + // COUNT: allocations requested by client code. Note that active + // count can be extracted by looking at current allocations + Stat allocation; + // COUNT: number of allocated segments from host memory allocation. + Stat segment; + + // SUM: bytes allocated by this memory alocator. Note that active bytes + // can be extracted by looking at current bytes allocated + Stat allocated_bytes; + // SUM: bytes reserved by this memory allocator (both free and used) + Stat reserved_bytes; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // SUM: time spent in cudaHostAlloc/cudaHostRegister in microseconds DurationStat host_alloc_time; @@ -114,14 +140,18 @@ struct TORCH_API HostStats { // COUNT: number of times cudaHostFree/cudaHostUnregister was called. int64_t num_host_free = 0; // This is derived from segment or timing +<<<<<<< HEAD // Count of cudaHostAlloc/cudaHostRegister per bucket std::vector bucket_allocation = std::vector(MAX_SIZE_INDEX); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; // Struct containing memory allocator summary statistics for host, as they // are staged for reporting. This is a temporary struct that is used to // avoid locking the allocator while collecting stats. +<<<<<<< HEAD struct alignas(hardware_destructive_interference_size) HostStatsStaged { std::mutex timing_mutex_; // COUNT: total allocations (active + free) @@ -140,6 +170,21 @@ struct alignas(hardware_destructive_interference_size) HostStatsStaged { // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_ std::vector allocation_bucket_stats = std::vector(MAX_SIZE_INDEX); // SUM: bytes of allocation per bucket (active + free) +======= +struct alignas(64) HostStatsStaged { + std::mutex timing_mutex_; + // COUNT: allocations requested by client code resulting in a new segment/block allocation + // LOCK: access to this stat is protected by the allocator's blocks_mutex_ + Stat allocation; + // SUM: bytes within active memory blocks, including blocks that are + // currently in the free list. + // LOCK: access to this stat is protected by the allocator's blocks_mutex_ + Stat allocated_bytes; + // COUNT: number of allocations per bucket + // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_ + std::vector allocation_bucket_stats = std::vector(MAX_SIZE_INDEX); + // SUM: bytes of allocation per bucket +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_ std::vector allocated_bytes_bucket_stats = std::vector(MAX_SIZE_INDEX); // SUM: time spent in cudaHostAlloc/cudaHostRegister @@ -258,6 +303,15 @@ struct CachingHostAllocatorImpl { // Check in the recently freed blocks with pending events to see if we // can reuse them. 
Call get_free_block again after processing events if (pinned_use_background_threads()) { +<<<<<<< HEAD +======= + process_events_for_specific_size(roundSize); + block = get_free_block(roundSize); + if (block) { + return {block->ptr_, reinterpret_cast(block)}; + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Launch the background thread and process events in a loop. static bool background_thread_flag [[maybe_unused]] = [this] { getBackgroundThreadPool()->run([&]() { @@ -293,7 +347,10 @@ struct CachingHostAllocatorImpl { auto* block = reinterpret_cast(ctx); std::optional> events; +<<<<<<< HEAD ska::flat_hash_set streams; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { std::lock_guard g(block->mutex_); block->allocated_ = false; @@ -302,23 +359,38 @@ struct CachingHostAllocatorImpl { } else { events = std::vector(); events->reserve(block->streams_.size()); +<<<<<<< HEAD block->event_count_ += block->streams_.size(); // Move out streams to avoid holding the mutex during event recording streams = std::move(block->streams_); +======= + for (auto stream : block->streams_) { + record_stream(events, stream); + } + block->event_count_ += events->size(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) block->streams_.clear(); } } +<<<<<<< HEAD // Event recording must be done outside the mutex to avoid potential // deadlocks (e.g., when Python GIL is involved) for (auto stream : streams) { record_stream(events, stream); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (!events) { auto index = size_index(block->size_); std::lock_guard g(free_list_[index].mutex_); free_list_[index].list_.push_back(block); +<<<<<<< HEAD +======= + stats_.allocation_bucket_stats[index].decrease(1); + stats_.allocated_bytes_bucket_stats[index].decrease(block->size_); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { // restore these events that record by used streams. 
std::lock_guard g(events_mutex_); @@ -378,12 +450,18 @@ struct CachingHostAllocatorImpl { for (auto* block : blocks_to_remove) { blocks_.erase(block); ptr_to_block_.erase(block->ptr_); +<<<<<<< HEAD auto index = size_index(block->size_); free_block(block); stats_.allocations.decrease(1); stats_.allocated_bytes.decrease(block->size_); stats_.allocation_bucket_stats[index].decrease(1); stats_.allocated_bytes_bucket_stats[index].decrease(block->size_); +======= + stats_.allocation.decrease(1); + stats_.allocated_bytes.decrease(block->size_); + free_block(block); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) delete block; } } @@ -394,8 +472,12 @@ struct CachingHostAllocatorImpl { } virtual bool pinned_use_background_threads() { +<<<<<<< HEAD return c10::CachingAllocator::AcceleratorAllocatorConfig:: pinned_use_background_threads(); +======= + return false; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } virtual void copy_data(void* dest [[maybe_unused]], const void* src [[maybe_unused]], std::size_t count [[maybe_unused]]) const { @@ -430,17 +512,29 @@ struct CachingHostAllocatorImpl { // per bucket (we pick index 0 arbitrarily). These are also all the host // allocations, not taking into account caching and free lists. if (i == 0) { +<<<<<<< HEAD stats.allocations = stats_.allocations; stats.allocated_bytes = stats_.allocated_bytes; stats.num_host_alloc = stats.allocations.allocated; stats.num_host_free = stats.allocations.freed; +======= + stats.segment = stats_.allocation; + stats.reserved_bytes = stats_.allocated_bytes; + stats.num_host_alloc = stats.segment.allocated; + stats.num_host_free = stats.segment.freed; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Bucket stats need to be merged with the slow-path stats. We do this in // a best effort manner, since we can't really replay the cached events per bucket. +<<<<<<< HEAD add_bucket_stats(stats.active_requests, stats_.active_bucket_stats[i]); add_bucket_stats(stats.active_bytes, stats_.active_bytes_bucket_stats[i]); stats.bucket_allocation[i] = stats_.allocation_bucket_stats[i].allocated; +======= + add_bucket_stats(stats.allocation, stats_.allocation_bucket_stats[i]); + add_bucket_stats(stats.allocated_bytes, stats_.allocated_bytes_bucket_stats[i]); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Get the timing stats @@ -455,7 +549,11 @@ struct CachingHostAllocatorImpl { } void resetAccumulatedStats() { +<<<<<<< HEAD // Resetting accumulated memory stats requires concurrently holding both the +======= + // Reseting accumulated memory stats requires concurrently holding both the +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // free list mutexes and the blocks mutex. Previously, this was only done in // empty_cache function. 
for (size_t i = 0; i < free_list_.size(); ++i) { @@ -464,11 +562,17 @@ struct CachingHostAllocatorImpl { std::lock_guard gb(blocks_mutex_, std::adopt_lock); if (i == 0) { +<<<<<<< HEAD stats_.allocations.reset_accumulated(); stats_.allocated_bytes.reset_accumulated(); } stats_.active_bucket_stats[i].reset_accumulated(); stats_.active_bytes_bucket_stats[i].reset_accumulated(); +======= + stats_.allocation.reset_accumulated(); + stats_.allocated_bytes.reset_accumulated(); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) stats_.allocation_bucket_stats[i].reset_accumulated(); stats_.allocated_bytes_bucket_stats[i].reset_accumulated(); } @@ -482,7 +586,11 @@ struct CachingHostAllocatorImpl { } void resetPeakStats() { +<<<<<<< HEAD // Resetting peak memory stats requires concurrently holding both the +======= + // Reseting peak memory stats requires concurrently holding both the +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // free list mutexes and the blocks mutex. Previously, this was only done in // empty_cache function. for (size_t i = 0; i < free_list_.size(); ++i) { @@ -491,11 +599,17 @@ struct CachingHostAllocatorImpl { std::lock_guard gb(blocks_mutex_, std::adopt_lock); if (i == 0) { +<<<<<<< HEAD stats_.allocations.reset_peak(); stats_.allocated_bytes.reset_peak(); } stats_.active_bucket_stats[i].reset_peak(); stats_.active_bytes_bucket_stats[i].reset_peak(); +======= + stats_.allocation.reset_peak(); + stats_.allocated_bytes.reset_peak(); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) stats_.allocation_bucket_stats[i].reset_peak(); stats_.allocated_bytes_bucket_stats[i].reset_peak(); } @@ -512,7 +626,11 @@ struct CachingHostAllocatorImpl { virtual void add_allocated_block(B* block) { std::lock_guard g(blocks_mutex_); blocks_.insert(block); +<<<<<<< HEAD stats_.allocations.increase(1); +======= + stats_.allocation.increase(1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) stats_.allocated_bytes.increase(block->size_); ptr_to_block_.insert({block->ptr_, block}); @@ -525,8 +643,11 @@ struct CachingHostAllocatorImpl { std::lock_guard g(free_list_[index].mutex_); stats_.allocation_bucket_stats[index].increase(1); stats_.allocated_bytes_bucket_stats[index].increase(size); +<<<<<<< HEAD stats_.active_bucket_stats[index].increase(1); stats_.active_bytes_bucket_stats[index].increase(size); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } @@ -537,8 +658,13 @@ struct CachingHostAllocatorImpl { B* block = free_list_[index].list_.back(); free_list_[index].list_.pop_back(); block->allocated_ = true; +<<<<<<< HEAD stats_.active_bucket_stats[index].increase(1); stats_.active_bytes_bucket_stats[index].increase(size); +======= + stats_.allocation_bucket_stats[index].increase(1); + stats_.allocated_bytes_bucket_stats[index].increase(size); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return block; } return nullptr; @@ -632,8 +758,13 @@ struct CachingHostAllocatorImpl { auto index = size_index(block->size_); std::lock_guard g(free_list_[index].mutex_); free_list_[index].list_.push_back(block); +<<<<<<< HEAD 
stats_.active_bucket_stats[index].decrease(1); stats_.active_bytes_bucket_stats[index].decrease(size); +======= + stats_.allocation_bucket_stats[index].decrease(1); + stats_.allocated_bytes_bucket_stats[index].decrease(size); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (size != -1) { return; } @@ -669,7 +800,11 @@ struct CachingHostAllocatorImpl { TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event"); } +<<<<<<< HEAD alignas(hardware_destructive_interference_size) std::mutex blocks_mutex_; +======= + alignas(64) std::mutex blocks_mutex_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ska::flat_hash_set blocks_; // block list ska::flat_hash_map ptr_to_block_; @@ -677,17 +812,28 @@ struct CachingHostAllocatorImpl { // size. This allows us to quickly find a free block of the right size. // We use deque to store per size free list and guard the list with its own // mutex. +<<<<<<< HEAD alignas(hardware_destructive_interference_size) std::vector> free_list_{MAX_SIZE_INDEX}; alignas(hardware_destructive_interference_size) std::mutex events_mutex_; +======= + alignas(64) std::vector> free_list_ = + std::vector>(MAX_SIZE_INDEX); + + alignas(64) std::mutex events_mutex_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::deque> events_; // event queue paired with block // Indicates whether the object is active. // Set to false in the destructor to signal background threads to stop. std::atomic active_{true}; protected: +<<<<<<< HEAD alignas(hardware_destructive_interference_size) HostStatsStaged stats_; +======= + alignas(64) HostStatsStaged stats_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; struct TORCH_API HostAllocator : public at::Allocator { diff --git a/aten/src/ATen/core/Generator.h b/aten/src/ATen/core/Generator.h index 679308872989d..90b21fd630fd7 100644 --- a/aten/src/ATen/core/Generator.h +++ b/aten/src/ATen/core/Generator.h @@ -59,7 +59,13 @@ struct TORCH_API Generator { explicit Generator(c10::intrusive_ptr gen_impl) : impl_(std::move(gen_impl)) { +<<<<<<< HEAD TORCH_CHECK(impl_.get(), "GeneratorImpl with nullptr is not supported"); +======= + if (impl_.get() == nullptr) { + throw std::runtime_error("GeneratorImpl with nullptr is not supported"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } bool operator==(const Generator& rhs) const { diff --git a/aten/src/ATen/core/NamedTensor.cpp b/aten/src/ATen/core/NamedTensor.cpp index 0bbeb9ddc13ae..3ee25c0edc805 100644 --- a/aten/src/ATen/core/NamedTensor.cpp +++ b/aten/src/ATen/core/NamedTensor.cpp @@ -49,7 +49,11 @@ static void check_unique_names(DimnameList names) { } void check_names_valid_for(const TensorBase& tensor, DimnameList names) { +<<<<<<< HEAD impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names); +======= + return impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void check_names_valid_for(size_t tensor_dim, DimnameList names) { diff --git a/aten/src/ATen/core/NamedTensor.h b/aten/src/ATen/core/NamedTensor.h index 52acae90b1280..759f7eb3ff8d7 
100644 --- a/aten/src/ATen/core/NamedTensor.h +++ b/aten/src/ATen/core/NamedTensor.h @@ -27,11 +27,19 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface { HasNonWildcard }; +<<<<<<< HEAD explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, DimnameList names) : names_(names.vec()) { check_invariants(); } explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, std::vector&& names) +======= + explicit NamedTensorMeta(HAS_NON_WILDCARD, DimnameList names) + : names_(names.vec()) { + check_invariants(); + } + explicit NamedTensorMeta(HAS_NON_WILDCARD, std::vector&& names) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) : names_(std::move(names)) { check_invariants(); } @@ -52,13 +60,21 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface { std::any_of(names_.begin(), names_.end(), [](const Dimname& n) { return !n.isWildcard(); })); } +<<<<<<< HEAD void set_names(HAS_NON_WILDCARD /*unused*/, DimnameList new_names) { +======= + void set_names(HAS_NON_WILDCARD, DimnameList new_names) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT(new_names.size() == names_.size()); std::copy(new_names.begin(), new_names.end(), names_.begin()); check_invariants(); } +<<<<<<< HEAD void set_names(HAS_NON_WILDCARD /*unused*/, std::vector&& new_names) { +======= + void set_names(HAS_NON_WILDCARD, std::vector&& new_names) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT(new_names.size() == names_.size()); names_ = std::move(new_names); check_invariants(); diff --git a/aten/src/ATen/core/PhiloxRNGEngine.h b/aten/src/ATen/core/PhiloxRNGEngine.h index e8bac545933ca..9d967203751ca 100644 --- a/aten/src/ATen/core/PhiloxRNGEngine.h +++ b/aten/src/ATen/core/PhiloxRNGEngine.h @@ -229,10 +229,17 @@ class philox_engine { } +<<<<<<< HEAD static constexpr uint32_t kPhilox10A = 0x9E3779B9; static constexpr uint32_t kPhilox10B = 0xBB67AE85; static constexpr uint32_t kPhiloxSA = 0xD2511F53; static constexpr uint32_t kPhiloxSB = 0xCD9E8D57; +======= + static const uint32_t kPhilox10A = 0x9E3779B9; + static const uint32_t kPhilox10B = 0xBB67AE85; + static const uint32_t kPhiloxSA = 0xD2511F53; + static const uint32_t kPhiloxSB = 0xCD9E8D57; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; typedef philox_engine Philox4_32; diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp index 39f4e7cb69764..f5a0484db4c25 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.cpp +++ b/aten/src/ATen/core/PythonFallbackKernel.cpp @@ -2,7 +2,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace { @@ -54,24 +57,38 @@ void pythonFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_ TORCH_INTERNAL_ASSERT(tls_on_entry.has_value()); // c10::impl::ForceDispatchKeyGuard dispatcher_guard(tls_on_entry.value()); // StashTLSOnEntryGuard stash_guard; +<<<<<<< HEAD c10::impl::ExcludeDispatchKeyGuard exclude_guard(after_Python_keyset); const auto& schema = op.schema(); const auto num_arguments = schema.arguments().size(); +======= + 
c10::impl::ExcludeDispatchKeyGuard guard(after_Python_keyset); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // If Torch Dispatch Mode is active, use its PyInterpreter for dispatch const auto mode_stack_len = c10::impl::TorchDispatchModeTLS::stack_len(); if (mode_stack_len > 0) { +<<<<<<< HEAD RECORD_FUNCTION("PythonDispatchMode", torch::jit::last(*stack, num_arguments)); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto& cur_torch_dispatch_mode_state = c10::impl::TorchDispatchModeTLS::get_stack_at(mode_stack_len - 1); cur_torch_dispatch_mode_state->pyinterpreter()->dispatch(op, stack); return; } +<<<<<<< HEAD RECORD_FUNCTION("PythonSubclass", torch::jit::last(*stack, num_arguments)); // Otherwise, find a PyInterpreter on a Tensor +======= + // Otherwise, find a PyInterpreter on a Tensor + const auto& schema = op.schema(); + const auto num_arguments = schema.arguments().size(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // It is safe to dispatch on the very first Tensor with a pyobj_interpreter // without checking the interpreters of any of the arguments, because when // we actually run dispatch(), we will take out PyObjects in the context diff --git a/aten/src/ATen/core/PythonOpRegistrationTrampoline.h b/aten/src/ATen/core/PythonOpRegistrationTrampoline.h index 83b39de34d782..6bf76c229fe44 100644 --- a/aten/src/ATen/core/PythonOpRegistrationTrampoline.h +++ b/aten/src/ATen/core/PythonOpRegistrationTrampoline.h @@ -13,7 +13,11 @@ class TORCH_API PythonOpRegistrationTrampoline final { public: // Returns true if you successfully registered yourself (that means // you are in the hot seat for doing the operator registrations!) +<<<<<<< HEAD static bool registerInterpreter(c10::impl::PyInterpreter* /*interp*/); +======= + static bool registerInterpreter(c10::impl::PyInterpreter*); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Returns nullptr if no interpreter has been registered yet. 
static c10::impl::PyInterpreter* getInterpreter(); diff --git a/aten/src/ATen/core/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp index c5f887f096cd1..650b338000b01 100644 --- a/aten/src/ATen/core/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -138,7 +138,11 @@ void Tensor::_backward(TensorList inputs, const std::optional& gradient, std::optional keep_graph, bool create_graph) const { +<<<<<<< HEAD impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph); +======= + return impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } const TensorBase& TensorBase::requires_grad_(bool _requires_grad) const { @@ -173,6 +177,7 @@ unsigned TensorBase::_register_hook(std::function return impl::GetVariableHooks()->_register_hook(*this, std::move(hook)); } +<<<<<<< HEAD std::optional TensorBase::grad_dtype() const { return impl::GetVariableHooks()->grad_dtype(*this); } @@ -181,4 +186,6 @@ void TensorBase::set_grad_dtype(const std::optional& grad_dtype) con return impl::GetVariableHooks()->set_grad_dtype(*this, grad_dtype); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index 2b9558197bdcb..8da2cfcbbbe6a 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -1,5 +1,6 @@ #pragma once +<<<<<<< HEAD // See https://github.com/pytorch/pytorch/issues/161660 // This compile flag is intended to be passed in to CppExtensions that rely on // the stable ABI via the `extra_compile_args` argument. This is a stopgap @@ -13,6 +14,8 @@ "TensorBase.h should not be included when TORCH_STABLE_ONLY compile flag is passed" #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -100,7 +103,11 @@ class TORCH_API TensorBase { // Create a Tensor with a +0 reference count. Special care must be // taken to avoid decrementing this reference count at destruction // time. Intended to support MaybeOwnedTraits. 
+<<<<<<< HEAD explicit TensorBase(unsafe_borrow_t /*unused*/, const TensorBase& rhs) +======= + explicit TensorBase(unsafe_borrow_t, const TensorBase& rhs) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) : impl_(c10::intrusive_ptr(rhs.impl_.get(), c10::raw::DontIncreaseRefcount{})) {} friend MaybeOwnedTraits; @@ -111,7 +118,13 @@ class TORCH_API TensorBase { explicit TensorBase( c10::intrusive_ptr tensor_impl) : impl_(std::move(tensor_impl)) { +<<<<<<< HEAD TORCH_CHECK(impl_.get(), "TensorImpl with nullptr is not supported"); +======= + if (impl_.get() == nullptr) { + throw std::runtime_error("TensorImpl with nullptr is not supported"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } TensorBase(const TensorBase&) = default; TensorBase(TensorBase&&) noexcept = default; @@ -135,7 +148,11 @@ class TORCH_API TensorBase { } TensorBase contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) const { +<<<<<<< HEAD if (is_contiguous_or_false(memory_format)) { +======= + if (is_contiguous(memory_format)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return *this; } else { return __dispatch_contiguous(memory_format); @@ -276,6 +293,7 @@ class TORCH_API TensorBase { return impl_->is_contiguous(memory_format); } +<<<<<<< HEAD // Like is_contiguous, but more dynamic shape-friendly. May return a symbolic representation of // contiguity instead of SymTrue SymFalse, when results are data-dependent. c10::SymBool sym_is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Contiguous) const { @@ -295,6 +313,8 @@ class TORCH_API TensorBase { return impl_->is_contiguous(memory_format); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool is_non_overlapping_and_dense() const { return impl_->is_non_overlapping_and_dense(); } @@ -928,10 +948,13 @@ class TORCH_API TensorBase { const TensorBase& requires_grad_(bool _requires_grad=true) const; +<<<<<<< HEAD std::optional grad_dtype() const; void set_grad_dtype(const std::optional& grad_dtype) const; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // View Variables //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -952,7 +975,11 @@ class TORCH_API TensorBase { c10::intrusive_ptr impl_; private: +<<<<<<< HEAD TensorBase __dispatch_contiguous(c10::MemoryFormat /*memory_format*/) const; +======= + TensorBase __dispatch_contiguous(c10::MemoryFormat) const; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; inline DeviceIndex get_device(const TensorBase& self) { diff --git a/aten/src/ATen/core/TransformationHelper.h b/aten/src/ATen/core/TransformationHelper.h index dad18bd019bbe..83a8260bd56c6 100644 --- a/aten/src/ATen/core/TransformationHelper.h +++ b/aten/src/ATen/core/TransformationHelper.h @@ -117,7 +117,11 @@ C10_HOST_DEVICE inline T cauchy(T val, T median, T sigma) { template <> C10_HOST_DEVICE inline double cauchy(double val, double median, double sigma) { // https://en.wikipedia.org/wiki/Cauchy_distribution#Cumulative_distribution_function +<<<<<<< HEAD return median + sigma * at::tan(c10::pi * (val - 0.5)); +======= + 
return median + sigma * at::tan(c10::pi * (val - static_cast(0.5))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } /** diff --git a/aten/src/ATen/core/VariableFallbackKernel.cpp b/aten/src/ATen/core/VariableFallbackKernel.cpp index dad3f090bb1ea..fdc8bf6e6e95a 100644 --- a/aten/src/ATen/core/VariableFallbackKernel.cpp +++ b/aten/src/ATen/core/VariableFallbackKernel.cpp @@ -109,10 +109,13 @@ TORCH_LIBRARY_IMPL(_, AutogradHPU, m) { m.fallback(AUTOGRAD_FALLBACK); } +<<<<<<< HEAD TORCH_LIBRARY_IMPL(_, AutogradPrivateUse1, m) { m.fallback(AUTOGRAD_FALLBACK); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #undef AUTOGRAD_FALLBACK } // namespace diff --git a/aten/src/ATen/core/VariableHooksInterface.h b/aten/src/ATen/core/VariableHooksInterface.h index c0f270700e3ce..8792156ac6064 100644 --- a/aten/src/ATen/core/VariableHooksInterface.h +++ b/aten/src/ATen/core/VariableHooksInterface.h @@ -68,8 +68,11 @@ struct TORCH_API VariableHooksInterface { const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) const = 0; +<<<<<<< HEAD virtual std::optional grad_dtype(const TensorBase&) const = 0; virtual void set_grad_dtype(const TensorBase&, const std::optional&) const = 0; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; TORCH_API void SetVariableHooks(VariableHooksInterface* hooks); diff --git a/aten/src/ATen/core/boxing/BoxedKernel.h b/aten/src/ATen/core/boxing/BoxedKernel.h index c5e46d8de000d..7abdc4b22c9a9 100644 --- a/aten/src/ATen/core/boxing/BoxedKernel.h +++ b/aten/src/ATen/core/boxing/BoxedKernel.h @@ -18,10 +18,17 @@ class KernelFunction; // implementation notes; notably, this does NOT actually go through the // boxing/unboxing codepath. TORCH_API void fallthrough_kernel( +<<<<<<< HEAD OperatorKernel* /*unused*/, const OperatorHandle& /*unused*/, DispatchKeySet /*unused*/, Stack* /*unused*/); +======= + OperatorKernel*, + const OperatorHandle&, + DispatchKeySet, + Stack*); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Note [Ambiguity in AutogradOther kernel] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -62,10 +69,17 @@ TORCH_API void fallthrough_kernel( // than arbitrarily pick one or the other, we just register a kernel that raises // an error and let the user decide how to proceed. TORCH_API void ambiguous_autogradother_kernel( +<<<<<<< HEAD OperatorKernel* /*unused*/, const OperatorHandle& /*op*/, DispatchKeySet /*unused*/, Stack* /*unused*/); +======= + OperatorKernel*, + const OperatorHandle&, + DispatchKeySet, + Stack*); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Note [named_not_supported_kernel] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -75,10 +89,17 @@ TORCH_API void ambiguous_autogradother_kernel( // give a good error message in cases when boxing is not supported). When // boxing is universally supported this can be removed. 
[[noreturn]] TORCH_API void named_not_supported_kernel( +<<<<<<< HEAD OperatorKernel* /*unused*/, const OperatorHandle& /*op*/, DispatchKeySet /*unused*/, Stack* /*unused*/); +======= + OperatorKernel*, + const OperatorHandle&, + DispatchKeySet, + Stack*); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /** * BoxedKernel is similar to a std::function storing a boxed kernel. @@ -185,16 +206,28 @@ class TORCH_API BoxedKernel final { template static void make_boxed_function( +<<<<<<< HEAD OperatorKernel* /*unused*/, const OperatorHandle& opHandle, DispatchKeySet /*unused*/, +======= + OperatorKernel*, + const OperatorHandle& opHandle, + DispatchKeySet, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Stack* stack); template static void make_boxed_function( +<<<<<<< HEAD OperatorKernel* /*unused*/, const OperatorHandle& opHandle, DispatchKeySet /*ks*/, +======= + OperatorKernel*, + const OperatorHandle& opHandle, + DispatchKeySet, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Stack* stack); explicit BoxedKernel( diff --git a/aten/src/ATen/core/boxing/BoxedKernel_impl.h b/aten/src/ATen/core/boxing/BoxedKernel_impl.h index 04ba1368f070a..ef94a608e00c4 100644 --- a/aten/src/ATen/core/boxing/BoxedKernel_impl.h +++ b/aten/src/ATen/core/boxing/BoxedKernel_impl.h @@ -2,7 +2,11 @@ namespace c10 { +<<<<<<< HEAD inline BoxedKernel::BoxedKernel() : boxed_kernel_func_(nullptr) {} +======= +inline BoxedKernel::BoxedKernel() : functor_(), boxed_kernel_func_(nullptr) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline BoxedKernel::BoxedKernel( std::unique_ptr functor, @@ -11,9 +15,15 @@ inline BoxedKernel::BoxedKernel( template inline void BoxedKernel::make_boxed_function( +<<<<<<< HEAD OperatorKernel* /*unused*/, const OperatorHandle& opHandle, DispatchKeySet /*unused*/, +======= + OperatorKernel*, + const OperatorHandle& opHandle, + DispatchKeySet, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Stack* stack) { // Note that we're dropping the DispatchKeySet argument. // See Note [Plumbing Keys Through The Dispatcher 2] for details. @@ -22,7 +32,11 @@ inline void BoxedKernel::make_boxed_function( template inline void BoxedKernel::make_boxed_function( +<<<<<<< HEAD OperatorKernel* /*unused*/, +======= + OperatorKernel*, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const OperatorHandle& opHandle, DispatchKeySet ks, Stack* stack) { diff --git a/aten/src/ATen/core/boxing/KernelFunction.cpp b/aten/src/ATen/core/boxing/KernelFunction.cpp index dd2fb32e6817d..1b1d2e752c2e2 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction.cpp @@ -10,7 +10,11 @@ namespace c10 { // be handled specially. Its semantics is that it redispatches to the // *next* dispatch key that would have been processed, skipping the current // one. 
+<<<<<<< HEAD void fallthrough_kernel(OperatorKernel* /*unused*/, const OperatorHandle& /*unused*/, DispatchKeySet /*unused*/, Stack* /*unused*/) { +======= +void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT(0, "fallthrough_kernel was executed but it should have been short-circuited by the dispatcher. " "This could occur if you registered a fallthrough kernel as a override for a specific operator " @@ -19,7 +23,11 @@ void fallthrough_kernel(OperatorKernel* /*unused*/, const OperatorHandle& /*unus "let us know in the bug tracker."); } +<<<<<<< HEAD void ambiguous_autogradother_kernel(OperatorKernel* /*unused*/, const OperatorHandle& op, DispatchKeySet /*unused*/, Stack* /*unused*/) { +======= +void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, DispatchKeySet, Stack*) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT(0, op.operator_name(), " has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. " "This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering " @@ -32,7 +40,11 @@ void ambiguous_autogradother_kernel(OperatorKernel* /*unused*/, const OperatorHa "\nCanonical state\n~~~~~~~~~~~\n", op.dumpState(), "\n\n"); } +<<<<<<< HEAD void named_not_supported_kernel(OperatorKernel* /*unused*/, const OperatorHandle& op, DispatchKeySet /*unused*/, Stack* /*unused*/) { +======= +void named_not_supported_kernel(OperatorKernel*, const OperatorHandle& op, DispatchKeySet, Stack*) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // DO NOT LOOK AT STACK, YOU HAVE SHORT CIRCUITED BOXING // See Note [named_not_supported_kernel] TORCH_CHECK(0, diff --git a/aten/src/ATen/core/boxing/KernelFunction.h b/aten/src/ATen/core/boxing/KernelFunction.h index 0314dcd9903e7..f2aec1977d2db 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.h +++ b/aten/src/ATen/core/boxing/KernelFunction.h @@ -218,7 +218,11 @@ class TORCH_API KernelFunction final { * &unboxed_func>(); */ template +<<<<<<< HEAD static KernelFunction makeFromUnboxedFunction(FuncPtr /*func_ptr*/); +======= + static KernelFunction makeFromUnboxedFunction(FuncPtr); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /** * Create a KernelFunction from an unboxed function. @@ -260,7 +264,11 @@ class TORCH_API KernelFunction final { std::string dumpState() const; // For testing internal invariants only +<<<<<<< HEAD bool _equalsBoxedAndUnboxed(const KernelFunction& /*other*/) const; +======= + bool _equalsBoxedAndUnboxed(const KernelFunction&) const; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) private: explicit KernelFunction( diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index 5b645506206f9..5077e03e18af6 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -15,12 +15,22 @@ std::enable_if_t< std::is_base_of_v, std::unique_ptr> make_unique_base(Args&&... 
args) { +<<<<<<< HEAD return std::make_unique(std::forward(args)...); +======= + return std::unique_ptr(new Child(std::forward(args)...)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // namespace detail inline KernelFunction::KernelFunction() +<<<<<<< HEAD : unboxed_kernel_func_(nullptr), sym_unboxed_kernel_func_(nullptr) {} +======= + : boxed_kernel_func_(), + unboxed_kernel_func_(nullptr), + sym_unboxed_kernel_func_(nullptr) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline KernelFunction::KernelFunction( std::unique_ptr functor, diff --git a/aten/src/ATen/core/boxing/impl/boxing.h b/aten/src/ATen/core/boxing/impl/boxing.h index 7fbc3b982609f..8c0cf276d9075 100644 --- a/aten/src/ATen/core/boxing/impl/boxing.h +++ b/aten/src/ATen/core/boxing/impl/boxing.h @@ -131,7 +131,11 @@ C10_ALWAYS_INLINE_UNLESS_MOBILE void boxToStack( new (dest++) IValue(options.pinned_memory()); } +<<<<<<< HEAD inline void boxArgsToStack(IValue*& /*unused*/) {} +======= +inline void boxArgsToStack(IValue*&) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template C10_ALWAYS_INLINE_UNLESS_MOBILE void boxArgsToStack( @@ -185,7 +189,11 @@ struct PopResult> final { template static Result pop_to_tuple_impl( Stack& stack, +<<<<<<< HEAD std::index_sequence /*unused*/) { +======= + std::index_sequence) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::make_tuple((std::move(stack[indices]).template to())...); } }; diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h index 34b1514f32cdb..8f8ba4a558f98 100644 --- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h +++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h @@ -105,7 +105,11 @@ using supported_primitive_arg_types = guts::typelist::typelist< // So a valid input type is one that our boxed functor wrapper can // unbox from an IValue into a C++ value. // +<<<<<<< HEAD // Whereas a valid output type is one that our wrapper can receive +======= +// Whereas a valid output type is one that our wrapper can recieve +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // as a C++ value from the unboxed functor, and box into an IValue. // @@ -561,7 +565,11 @@ struct wrap_kernel_functor_unboxed_< // doesn't use && static ReturnType call( OperatorKernel* functor, +<<<<<<< HEAD DispatchKeySet /*unused*/, +======= + DispatchKeySet, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ParameterTypes... 
args) { KernelFunctor* functor_ = static_cast(functor); // Note [Plumbing Keys Through The Dispatcher 2] @@ -629,8 +637,13 @@ call_functor_with_args_from_stack_( OperatorKernel* functor, DispatchKeySet dispatchKeySet, Stack* stack, +<<<<<<< HEAD std::index_sequence /*unused*/, guts::typelist::typelist* /*unused*/) { +======= + std::index_sequence, + guts::typelist::typelist*) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) (void)(stack); // when sizeof...(ivalue_arg_indices) == 0, this argument would // be unused and we have to silence the compiler warning. @@ -708,7 +721,11 @@ struct push_outputs, AllowDeprecatedTypes> final { static void call_( std::tuple&& output, Stack* stack, +<<<<<<< HEAD std::index_sequence /*unused*/) { +======= + std::index_sequence) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torch::jit::push( *stack, return_to_ivalue::call( @@ -718,7 +735,11 @@ struct push_outputs, AllowDeprecatedTypes> final { static void copy_( const std::tuple& output, Stack* stack, +<<<<<<< HEAD std::index_sequence /*unused*/) { +======= + std::index_sequence) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torch::jit::push( *stack, return_to_ivalue::copy( @@ -741,7 +762,11 @@ struct make_boxed_from_unboxed_functor final { static void call( OperatorKernel* functor, +<<<<<<< HEAD const OperatorHandle& /*unused*/, +======= + const OperatorHandle&, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DispatchKeySet dispatchKeySet, Stack* stack) { using ReturnType = diff --git a/aten/src/ATen/core/builtin_function.h b/aten/src/ATen/core/builtin_function.h index 8c837871dff7d..fddc5f844c787 100644 --- a/aten/src/ATen/core/builtin_function.h +++ b/aten/src/ATen/core/builtin_function.h @@ -63,13 +63,22 @@ struct BuiltinOpFunction : public Function { bool call( Stack& stack, +<<<<<<< HEAD std::optional /*unused*/, c10::function_ref /*unused*/) override { +======= + std::optional, + c10::function_ref) override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run(stack); return false; } +<<<<<<< HEAD bool call(Stack& stack, c10::function_ref /*unused*/) +======= + bool call(Stack& stack, c10::function_ref) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) override { run(stack); return false; diff --git a/aten/src/ATen/core/class_type.h b/aten/src/ATen/core/class_type.h index ea537400ef73d..2524980cc4988 100644 --- a/aten/src/ATen/core/class_type.h +++ b/aten/src/ATen/core/class_type.h @@ -148,7 +148,11 @@ struct TORCH_API ClassType : public NamedType { void checkNotExist(const std::string& name, const std::string& what) const; +<<<<<<< HEAD // Attributes are stored in a specific slot at runtime for efficiency. +======= + // Attributes are stored in a specific slot at runtime for effiency. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // When emitting instructions we specify the slot so that attribute access is // a constant lookup std::optional findAttributeSlot(const std::string& name) const { @@ -412,7 +416,11 @@ struct TORCH_API ClassType : public NamedType { // Holds method attributes std::weak_ptr compilation_unit_; +<<<<<<< HEAD // Holds all attributes, attribute details are found on ClassAttribute +======= + // Holds all atrributes, attribute details are found on ClassAttribute +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::vector attributes_; // Construct mirroring attributes_, only around due to the fact that `containedTypes()` method returns an ArrayRef. // Never fill this without using the appropriate provideNewClassAttribute method diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h index dbd00e9c52909..1e32f59b5302f 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h @@ -80,8 +80,12 @@ struct MultiDispatchKeySet : at::IterArgs { ts = ts | x.key_set(); } } +<<<<<<< HEAD [[noreturn]] void operator()( at::ArrayRef> /*unused*/) { +======= + [[noreturn]] void operator()(at::ArrayRef>) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Just checking that the handling of Tensor?[] didn't change. TORCH_INTERNAL_ASSERT(false); } @@ -96,7 +100,11 @@ struct MultiDispatchKeySet : at::IterArgs { } } template +<<<<<<< HEAD void operator()(const T& /*unused*/) { +======= + void operator()(const T&) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // do nothing } }; diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index afcaf51f231ae..8a50f1d44aee4 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -76,7 +76,17 @@ void _print_dispatch_trace(const std::string& label, const std::string& op_name, OpRegistrationListener::~OpRegistrationListener()= default; +<<<<<<< HEAD Dispatcher::Dispatcher(): backendFallbackKernels_(), listeners_(std::make_unique()), guard_(std::make_shared()) +======= +Dispatcher::Dispatcher() +: operators_() +, operatorLookupTable_() +, backendFallbackKernels_() +, listeners_(std::make_unique()) +, cond_var_() +, guard_(std::make_shared()) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {} Dispatcher::~Dispatcher() { @@ -442,6 +452,7 @@ RegistrationHandleRAII Dispatcher::registerFallback(DispatchKey dispatchKey, Ker auto idx = getDispatchTableIndexForDispatchKey(dispatchKey); TORCH_CHECK(idx >= 0 && static_cast(idx) < backendFallbackKernels_.size(), "idx=", idx); +<<<<<<< HEAD // NB: Perserve BC for registering fallback for AutogradPrivateUse1 multiple time, // refer to https://github.com/pytorch/pytorch/issues/163979 for more informations. 
TORCH_CHECK( @@ -453,6 +464,13 @@ RegistrationHandleRAII Dispatcher::registerFallback(DispatchKey dispatchKey, Ker backendFallbackKernels_[idx].debug, ", new registration ", debug); +======= + TORCH_CHECK( + !backendFallbackKernels_[idx].kernel.isValid(), + "Tried to register multiple backend fallbacks for the same dispatch key ", dispatchKey, "; previous registration ", + backendFallbackKernels_[idx].debug, ", new registration ", debug + ); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NB: inferred function schema is always nullptr for fallbacks, as fallbacks // cannot be unboxed backendFallbackKernels_[idx] = impl::AnnotatedKernel(std::move(kernel), nullptr, std::move(debug)); @@ -537,7 +555,11 @@ int64_t Dispatcher::sequenceNumberForRunningRecordFunction(DispatchKey dispatchK // Note: this records a sequence number for both Autograd keys, and for // non-Autograd keys where the dispatchKeySet still contains an autograd key. +<<<<<<< HEAD // This means that we might collect the same sequence number two different +======= + // This means that we might collect the same sequence nubmer two different +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // events if they all occurred above Autograd and still had the Autograd // dispatch key in the dispatch key set. // However, this usually doesn't happen: normally the first call will @@ -568,9 +590,15 @@ bool Dispatcher::profilingOperatorEvents() { return TORCH_SDT_IS_ENABLED(operator_start) || TORCH_SDT_IS_ENABLED(operator_end); } +<<<<<<< HEAD C10_NOINLINE void Dispatcher::fireOpStartUSDT(at::RecordFunction::schema_ref_t schema_ref, std::vector& argsAddresses, std::vector& argsTypes) { if (TORCH_SDT_IS_ENABLED(operator_start)) { TORCH_SDT_WITH_SEMAPHORE(operator_start, schema_ref.get().name().c_str(), argsAddresses.size(), argsAddresses.data(), argsTypes.data()); +======= +C10_NOINLINE void Dispatcher::fireOpStartUSDT(at::RecordFunction::schema_ref_t schema_ref) { + if (TORCH_SDT_IS_ENABLED(operator_start)) { + TORCH_SDT_WITH_SEMAPHORE(operator_start, schema_ref.get().name().c_str()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index 79a8e28d88b64..6ef75c054f58d 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -96,7 +96,11 @@ class TORCH_API Dispatcher final { friend class TypedOperatorHandle; struct Guard final { +<<<<<<< HEAD Guard() : alive(true) {} +======= + Guard() : alive(true), mutex() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::atomic alive; std::mutex mutex; }; @@ -371,10 +375,14 @@ class TORCH_API Dispatcher final { #ifdef FBCODE_CAFFE2 static bool profilingOperatorEvents(); +<<<<<<< HEAD static void fireOpStartUSDT( at::RecordFunction::schema_ref_t schema_ref, std::vector& argsAddresses, std::vector& argsTypes); +======= + static void fireOpStartUSDT(at::RecordFunction::schema_ref_t schema_ref); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static void fireOpEndUSDT(at::RecordFunction::schema_ref_t schema_ref); #endif // FBCODE_CAFFE2 @@ -492,7 +500,11 @@ class TORCH_API 
OperatorHandle { } void checkInvariants() const { +<<<<<<< HEAD operatorDef_->op.checkInvariants(); +======= + return operatorDef_->op.checkInvariants(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } c10::ArrayRef getTags() const { @@ -581,7 +593,11 @@ class TORCH_API OperatorHandle { // We need to store this iterator in order to make // Dispatcher::cleanup() fast -- it runs a lot on program +<<<<<<< HEAD // termination (and presumably library unloading). +======= + // termination (and presuambly library unloading). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::list::iterator operatorIterator_; }; @@ -629,7 +645,11 @@ class TypedOperatorHandle final : public OperatorHandle { namespace detail { template +<<<<<<< HEAD inline void unused_arg_(const Args&... /*unused*/) {} +======= +inline void unused_arg_(const Args&...) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // CaptureKernelCall is intended to capture return values from Dispatcher // unboxed kernel calls. A record function may request to get outputs from the @@ -798,6 +818,7 @@ C10_ALWAYS_INLINE_UNLESS_MOBILE Return Dispatcher::call( #ifdef FBCODE_CAFFE2 if (profilingOperatorEvents()) { +<<<<<<< HEAD std::vector argsAddresses = {(void*)(&args)...}; std::vector argsTypes = {(typeid(args).name())...}; struct FireOpRAII { @@ -807,12 +828,22 @@ C10_ALWAYS_INLINE_UNLESS_MOBILE Return Dispatcher::call( std::vector& argsTypes) : schema_ref_(schema_ref) { fireOpStartUSDT(schema_ref, argsAddresses, argsTypes); +======= + struct FireOpRAII { + FireOpRAII(at::RecordFunction::schema_ref_t schema_ref) + : schema_ref_(schema_ref) { + fireOpStartUSDT(schema_ref); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } ~FireOpRAII() { fireOpEndUSDT(schema_ref_); } at::RecordFunction::schema_ref_t schema_ref_; +<<<<<<< HEAD } event(op.schema(), argsAddresses, argsTypes); +======= + } event(op.schema()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return kernel.template call( op, dispatchKeySet, std::forward(args)...); } else { @@ -928,7 +959,11 @@ inline void Dispatcher::redispatchBoxed( } #endif const auto& kernel = entry.lookup(dispatchKeySet); +<<<<<<< HEAD kernel.callBoxed(op, dispatchKeySet, stack); +======= + return kernel.callBoxed(op, dispatchKeySet, stack); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // namespace c10 diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index add1ba059ea8a..adfcf1e312186 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -62,7 +62,21 @@ static const auto& getDispatchTableIndexToKey() { } OperatorEntry::OperatorEntry(OperatorName&& operator_name) +<<<<<<< HEAD : name_(std::move(operator_name)), dispatchTable_(), dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized()), is_observed_(ObservedOperators::isObserved(name_)) +======= +: name_(std::move(operator_name)) +, schema_() +#ifndef C10_MOBILE +, tags_() +#endif +, dispatchTable_() +, dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized()) +, 
kernels_() +, cpp_signature_() +, sym_cpp_signature_() +, is_observed_(ObservedOperators::isObserved(name_)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { // Pick up any backend fallbacks that were registered prior to this // OperatorEntry being created. @@ -329,7 +343,11 @@ std::pair OperatorEntry::computeDispatchTab // For autograd keys, we only use kernel from CompositeImplicitAutograd when there's no direct registration // to its corresponding backend key or CompositeExplicitAutograd. See Note [CompositeExplicitAutograd and CompositeImplicitAutograd]. // For AutogradOther, we eagerly return ambiguousAutogradOtherKernel() if there's registration to any of +<<<<<<< HEAD // its backends and ask backend extender to request a dedicated Autograd key for the backend. +======= + // its backends and ask backend extender to request a decicated Autograd key for the backend. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // See Note [Ambiguity in AutogradOther kernel] for more details. // A CompositeExplicitAutograd kernel prevents CompositeImplicitAutograd kernel being used for Autograd keys, but it doesn't // cause confusion for AutogradOther. It's pretty straightforward to use Autograd (if available) diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h index 176b16f7265fe..3a714c9193677 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.h +++ b/aten/src/ATen/core/dispatch/OperatorEntry.h @@ -105,7 +105,11 @@ class TORCH_API OperatorEntry final { // versa that is an error. (Refcounting for the registrations is // handled in the OperatorHandle in Dispatcher) void registerSchema( +<<<<<<< HEAD FunctionSchema&& /*schema*/, +======= + FunctionSchema&&, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::string&& debug, std::vector tags = {}); void deregisterSchema(); diff --git a/aten/src/ATen/core/dynamic_type.cpp b/aten/src/ATen/core/dynamic_type.cpp index 2b1a32bd0ac8a..a52cd5a67514b 100644 --- a/aten/src/ATen/core/dynamic_type.cpp +++ b/aten/src/ATen/core/dynamic_type.cpp @@ -177,7 +177,11 @@ bool DynamicType::equals(const Type& rhs) const { return equals(*create(rhs)); } +<<<<<<< HEAD bool DynamicType::isSubtypeOfExt(const Type& rhs, std::ostream* /*why_not*/) const { +======= +bool DynamicType::isSubtypeOfExt(const Type& rhs, std::ostream*) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto other = create(rhs); if (tag_ == other->tag_) { if (equals(*other)) { @@ -371,7 +375,11 @@ DynamicTypePtr ivalue::TupleTypeFactory::create( } DynamicTypePtr ivalue::TupleTypeFactory::fallback( +<<<<<<< HEAD const Type& /*unused*/) { +======= + const Type&) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return nullptr; } diff --git a/aten/src/ATen/core/dynamic_type.h b/aten/src/ATen/core/dynamic_type.h index ee0d077e5c51a..61f47bde598ef 100644 --- a/aten/src/ATen/core/dynamic_type.h +++ b/aten/src/ATen/core/dynamic_type.h @@ -64,7 +64,10 @@ constexpr DynamicTypeBits kDynamicClassTypeBit = DYNAMIC_TYPE_BIT(10); _(ScalarType, kDynamicIntTypeBit, 1) \ _(Layout, kDynamicIntTypeBit, 1) \ _(SymInt, 
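Several of the conflicting hunks above differ only in whether an intentionally unused parameter is left unnamed or carries its name in a /*comment*/. A small sketch of why the idiom exists (plain standard C++, no PyTorch types): an unnamed or comment-named parameter still participates in overload resolution and overriding, but cannot trip -Wunused-parameter in a default body that ignores it.

    struct Stack {};

    struct FunctionBase {
      // Default overload: the parameters exist so derived classes can override
      // with a matching signature; naming them only in comments documents that
      // this body deliberately ignores them.
      virtual bool call(Stack& /*stack*/, int /*bailout_depth*/) {
        return false;
      }
      virtual ~FunctionBase() = default;
    };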
kDynamicIntTypeBit, 1) \ +<<<<<<< HEAD _(SymBool, kDynamicIntTypeBit, 1) \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _(MemoryFormat, kDynamicIntTypeBit, 1) #define FORWARD_DECL_TYPE(NAME, _, __) struct NAME ## Type; @@ -138,8 +141,13 @@ class DynamicType : public SharedType { struct Arguments { Arguments() = default; +<<<<<<< HEAD Arguments(c10::ArrayRef /*args*/); Arguments(const std::vector& /*names*/, c10::ArrayRef /*args*/); +======= + Arguments(c10::ArrayRef); + Arguments(const std::vector&, c10::ArrayRef); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::vector elems; }; @@ -156,15 +164,24 @@ class DynamicType : public SharedType { static const TypeKind Kind = TypeKind::DynamicType; static TORCH_API DynamicTypePtr create(Type& ty); +<<<<<<< HEAD explicit DynamicType(Tag /*tag*/, Arguments /*arguments*/); explicit DynamicType(Tag /*tag*/, std::string_view /*name*/, Arguments /*arguments*/); +======= + explicit DynamicType(Tag, Arguments); + explicit DynamicType(Tag, std::string_view, Arguments); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DynamicType(DynamicType&& other) = delete; DynamicType(const DynamicType&) = delete; DynamicType& operator=(const DynamicType&) = delete; DynamicType& operator=(DynamicType&&) = delete; +<<<<<<< HEAD TypePtr containedType(size_t /*i*/) const override; +======= + TypePtr containedType(size_t) const override; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) size_t containedTypeSize() const override; Tag tag() const { return tag_; diff --git a/aten/src/ATen/core/function.h b/aten/src/ATen/core/function.h index 83db2ec9d71df..c4f48ed8cd3ce 100644 --- a/aten/src/ATen/core/function.h +++ b/aten/src/ATen/core/function.h @@ -96,15 +96,25 @@ struct TORCH_API Function { // Overload for server interpreter, a bailout size is needed for graph // executor. virtual bool call( +<<<<<<< HEAD Stack& /*unused*/, std::optional /*unused*/, c10::function_ref /*unused*/) { +======= + Stack&, + std::optional, + c10::function_ref) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return false; } // Overload for mobile interpreter. +<<<<<<< HEAD virtual bool call(Stack& /*unused*/, c10::function_ref /*unused*/) { +======= + virtual bool call(Stack&, c10::function_ref) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return false; } diff --git a/aten/src/ATen/core/function_schema.cpp b/aten/src/ATen/core/function_schema.cpp index 6587af0f9ccc0..b0e009aa07e58 100644 --- a/aten/src/ATen/core/function_schema.cpp +++ b/aten/src/ATen/core/function_schema.cpp @@ -261,7 +261,11 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) { // // There are 2 cases // 1. something like 'aten::items.str(Dict(str, t) self) -> ((str, t)[])'. +<<<<<<< HEAD // without the extra parenthesis, the c++ scheme parser can not parse it. +======= + // without the extra parenthesis, the c++ schem parser can not parse it. 
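InternedStrings::_symbol in the hunk above rejects any symbol that is not namespace-qualified; the only structural requirement is a "::" separator. A minimal sketch of that check under the same assumption (the first "::" splits namespace from name; splitQualified is an illustrative helper, not part of the API):

    #include <stdexcept>
    #include <string>
    #include <utility>

    std::pair<std::string, std::string> splitQualified(const std::string& s) {
      auto pos = s.find("::");
      if (pos == std::string::npos) {
        throw std::runtime_error("all symbols must have a namespace, ::, but found: " + s);
      }
      return {s.substr(0, pos), s.substr(pos + 2)};   // {"aten", "add"} for "aten::add"
    }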
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // 2. something like '-> ((str, str))'. Need extra parenthesis so the return // type is a single tuple rather than two strings. // PR (https://github.com/pytorch/pytorch/pull/23204) has more context about diff --git a/aten/src/ATen/core/interned_strings.cpp b/aten/src/ATen/core/interned_strings.cpp index 799f6821bb928..5c44c1122edae 100644 --- a/aten/src/ATen/core/interned_strings.cpp +++ b/aten/src/ATen/core/interned_strings.cpp @@ -68,7 +68,15 @@ Symbol InternedStrings::_symbol(const std::string& s) { return it->second; auto pos = s.find("::"); +<<<<<<< HEAD TORCH_CHECK(pos != std::string::npos, "all symbols must have a namespace, ::, but found: ", s); +======= + if (pos == std::string::npos) { + std::stringstream ss; + ss << "all symbols must have a namespace, ::, but found: " << s; + throw std::runtime_error(ss.str()); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Symbol ns = _symbol("namespaces::" + s.substr(0, pos)); Symbol sym(sym_to_info_.size()); @@ -117,7 +125,16 @@ std::string Symbol::domainString() const { } Symbol Symbol::fromDomainAndUnqualString(const std::string & d, const std::string & s) { +<<<<<<< HEAD TORCH_CHECK(d.compare(0, domain_prefix().size(), domain_prefix()) == 0, "Symbol: domain string is expected to be prefixed with '", domain_prefix(), "', e.g. 'org.pytorch.aten'"); +======= + if (d.compare(0, domain_prefix().size(), domain_prefix()) != 0) { + std::ostringstream ss; + ss << "Symbol: domain string is expected to be prefixed with '" + << domain_prefix() << "', e.g. 'org.pytorch.aten'"; + throw std::runtime_error(ss.str()); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::string qualString = d.substr(domain_prefix().size()) + "::" + s; return fromQualString(qualString); } diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index bb01c47e055a8..b0a6cf0ebcbcd 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -7,7 +7,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -98,8 +101,11 @@ c10::TypePtr IValue::TagType::get(const IValue& v) { return ComplexType::get(); case Tag::Int: return IntType::get(); +<<<<<<< HEAD case Tag::UInt: return IntType::get(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case Tag::SymInt: return c10::SymIntType::get(); case Tag::SymFloat: @@ -323,8 +329,11 @@ IValue IValue::equals(const IValue& rhs) const { return rhs.isComplexDouble() && lhs.toComplexDouble() == rhs.toComplexDouble(); case Tag::Int: return rhs.isInt() && lhs.toInt() == rhs.toInt(); +<<<<<<< HEAD case Tag::UInt: return rhs.isUnsigned() && lhs.toUInt() == rhs.toUInt(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case Tag::SymInt: return rhs.isSymInt() && lhs.toSymInt() == rhs.toSymInt(); case Tag::SymFloat: @@ -358,7 +367,11 @@ IValue IValue::equals(const IValue& rhs) const { case Tag::Enum: return lhs.toEnumHolder()->is(*rhs.toEnumHolder()); case Tag::Uninitialized: +<<<<<<< HEAD 
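The Tag::UInt cases being added or dropped above all follow from IValue being a tagged union: a Tag enum records which payload member is live, and every switch over the tag must stay exhaustive. A minimal sketch of the layout (MiniValue is a made-up stand-in, not the real IValue):

    #include <cstdint>
    #include <stdexcept>

    struct MiniValue {
      enum class Tag { Int, UInt, Double, Bool };
      union Payload {
        int64_t as_int;
        uint64_t as_uint;
        double as_double;
        bool as_bool;
      } payload;
      Tag tag;

      double toDouble() const {
        switch (tag) {                 // every Tag must be handled
          case Tag::Int:    return static_cast<double>(payload.as_int);
          case Tag::UInt:   return static_cast<double>(payload.as_uint);
          case Tag::Double: return payload.as_double;
          case Tag::Bool:   return payload.as_bool ? 1.0 : 0.0;
        }
        throw std::runtime_error("unreachable");
      }
    };
    // Usage: MiniValue v; v.tag = MiniValue::Tag::UInt; v.payload.as_uint = 42; v.toDouble();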
// Uninitialized ivalues show up in no-ops when the compiler can prove a +======= + // Unitialized ivalues show up in no-ops when the compiler can prove a +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // value will never be used. Just return false on any equality comparison. return false; } @@ -384,8 +397,11 @@ size_t IValue::hash(const IValue& v) { case Tag::Int: return c10::get_hash(v.payload.u.as_int); // NB: these are technically strict aliasing violations +<<<<<<< HEAD case Tag::UInt: return c10::get_hash(v.payload.u.as_int); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case Tag::SymInt: return c10::get_hash(v.payload.u.as_int); case Tag::SymFloat: @@ -413,7 +429,11 @@ size_t IValue::hash(const IValue& v) { case Tag::Enum: case Tag::Stream: case Tag::Uninitialized: +<<<<<<< HEAD TORCH_CHECK(false, +======= + throw std::runtime_error( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "unhashable type: '" + v.type()->repr_str() + "'"); } // the above switch should be exhaustive @@ -813,8 +833,11 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) { return printComplex(out, v); } case IValue::Tag::Int: return out << v.toInt(); +<<<<<<< HEAD case IValue::Tag::UInt: return out << v.toUInt(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case IValue::Tag::SymInt: return out << v.toSymInt(); case IValue::Tag::SymFloat: diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index f13b0613691b4..4041e8b198b8a 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -12,7 +12,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -161,7 +164,10 @@ struct Capsule { _(Double) \ _(ComplexDouble) \ _(Int) \ +<<<<<<< HEAD _(UInt) \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _(SymInt) \ _(SymFloat) \ _(SymBool) \ @@ -624,6 +630,7 @@ struct TORCH_API IValue final { IValue(const c10::SymBool& i) { if (auto mi = i.maybe_as_bool()) { tag = Tag::Bool; +<<<<<<< HEAD #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ payload.u.as_int = *mi; #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ @@ -632,6 +639,9 @@ struct TORCH_API IValue final { #else #error Unexpected or undefined __BYTE_ORDER__ #endif +======= + payload.u.as_int = *mi; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { tag = Tag::SymBool; payload.u.as_intrusive_ptr = i.toSymNodeImpl().release(); @@ -662,6 +672,7 @@ struct TORCH_API IValue final { } } +<<<<<<< HEAD // Unsigned IValue(uint64_t u) : tag( u <= std::numeric_limits::max() ? 
Tag::Int : Tag::UInt) { payload.u.as_uint = u; @@ -685,6 +696,8 @@ struct TORCH_API IValue final { } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Bool IValue(bool b) : tag(Tag::Bool) { #if defined(__clang__) && defined(__x86_64__) @@ -854,7 +867,11 @@ struct TORCH_API IValue final { IValue(std::optional v); template = nullptr> IValue(c10::OptionalArrayRef v); +<<<<<<< HEAD IValue(std::nullopt_t /*unused*/); +======= + IValue(std::nullopt_t); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // ClassType IValue(c10::intrusive_ptr v); @@ -925,6 +942,7 @@ struct TORCH_API IValue final { } else { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( s.isIntegral(false), "Unknown type in Scalar"); +<<<<<<< HEAD if (s.isUnsigned()) { const auto val = s.toUInt64(); payload.u.as_uint = val; @@ -933,6 +951,10 @@ struct TORCH_API IValue final { payload.u.as_int = s.toLong(); tag = Tag::Int; } +======= + tag = Tag::Int; + payload.u.as_int = s.toLong(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } @@ -956,8 +978,11 @@ struct TORCH_API IValue final { return toSymFloat(); else if (isSymBool()) return toSymBool(); +<<<<<<< HEAD else if (isUnsigned()) return toUInt(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "IValue is not a Scalar"); } @@ -1176,7 +1201,11 @@ struct TORCH_API IValue final { using HashIdentityIValueMap = std::unordered_map; +<<<<<<< HEAD // Checks if this and rhs has a subvalues in common. +======= + // Chechs if this and rhs has a subvalues in common. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // [t1,t2] and [t2, t3] returns true. 
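The IValue(uint64_t) constructor above only switches to the dedicated UInt tag when the value no longer fits in a signed 64-bit integer; anything up to INT64_MAX keeps the ordinary Int tag so existing integer paths are unaffected. A standalone sketch of that rule:

    #include <cstdint>
    #include <limits>

    enum class Tag { Int, UInt };

    Tag tagForUnsigned(uint64_t u) {
      return u <= static_cast<uint64_t>(std::numeric_limits<int64_t>::max())
          ? Tag::Int     // representable as int64_t, no special handling needed
          : Tag::UInt;   // only the top half of the uint64_t range needs UInt
    }
    // tagForUnsigned(42) == Tag::Int; tagForUnsigned(UINT64_MAX) == Tag::UInt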
bool overlaps(const IValue& rhs) const; @@ -1287,8 +1316,11 @@ struct TORCH_API IValue final { return true; case Tag::Int: return false; +<<<<<<< HEAD case Tag::UInt: return false; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case Tag::SymInt: return true; case Tag::SymFloat: @@ -1385,8 +1417,11 @@ struct TORCH_API IValue final { union TriviallyCopyablePayload { TriviallyCopyablePayload() : as_int(0) {} int64_t as_int; +<<<<<<< HEAD // See Note [Meaning of HAS_u] uint64_t as_uint; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) double as_double; bool as_bool; // Invariant: never nullptr; null state is represented as diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 8d1c3aa83dadb..418a2700820f2 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -660,7 +660,11 @@ struct TORCH_API TupleTypeFactory { template <> struct TORCH_API TupleTypeFactory { static DynamicTypePtr create(const std::vector& elemTypes); +<<<<<<< HEAD static DynamicTypePtr fallback(const Type& /*unused*/); +======= + static DynamicTypePtr fallback(const Type&); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; struct TORCH_API Tuple : c10::intrusive_ptr_target { @@ -1501,7 +1505,11 @@ struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target { // However, the CompilationUnit holds ownership of the type's graphs, so // inserting a constant object into a Graph would create a reference cycle if // that constant object held a shared_ptr to its CU. For these objects we +<<<<<<< HEAD // instantiate them with non-owning references to its CU +======= + // instatiate them with non-owning references to its CU +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Object(WeakOrStrongTypePtr type, size_t numSlots) : type_(std::move(type)) { slots_.resize(numSlots); } @@ -1682,7 +1690,11 @@ struct ivalue::EnumHolder : c10::intrusive_ptr_target { namespace detail { struct _guarded_unsigned_long_unique_dummy final { +<<<<<<< HEAD _guarded_unsigned_long_unique_dummy(int64_t /*unused*/){} +======= + _guarded_unsigned_long_unique_dummy(int64_t){} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; using _guarded_unsigned_long = std::conditional_t< std::is_same_v || @@ -1776,7 +1788,11 @@ template // native_functions.yaml still return std::vector. // C10_DEPRECATED_MESSAGE("IValues based on std::vector are potentially slow // and deprecated. Please use torch::List instead.") +<<<<<<< HEAD std::vector generic_to(IValue ivalue, _fake_type> /*unused*/) { +======= +std::vector generic_to(IValue ivalue, _fake_type>) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We need to do a deep copy of the vector because there might be other // references to this same IValue that also use the list. We can't just // move the elements out. 
@@ -1826,18 +1842,30 @@ c10::intrusive_ptr IValue::toCustomClass() const& { } template +<<<<<<< HEAD T generic_to(IValue ivalue, _fake_type /*unused*/) { +======= +T generic_to(IValue ivalue, _fake_type) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using ElemType = typename std::remove_pointer::type::element_type; return std::move(ivalue).template toCustomClass(); } template +<<<<<<< HEAD tagged_capsule generic_to(IValue ivalue, _fake_type> /*unused*/) { +======= +tagged_capsule generic_to(IValue ivalue, _fake_type>) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return tagged_capsule{std::move(ivalue)}; } template +<<<<<<< HEAD c10::List generic_to(IValue ivalue, _fake_type> /*unused*/) { +======= +c10::List generic_to(IValue ivalue, _fake_type>) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return impl::toTypedList(std::move(ivalue).toList()); } @@ -1867,7 +1895,11 @@ std::vector createVectorFromList(const c10::List& impl) { } template +<<<<<<< HEAD OptionalArray generic_to(IValue ivalue, _fake_type> /*unused*/) { +======= +OptionalArray generic_to(IValue ivalue, _fake_type>) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (ivalue.isNone()) { return {}; } @@ -1880,8 +1912,13 @@ namespace detail { template std::array generic_to_array( IValue ivalue, +<<<<<<< HEAD _fake_type> /*unused*/, std::index_sequence /*unused*/) { +======= + _fake_type>, + std::index_sequence) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We need to do a deep copy of the array because there might be other // references to this same IValue that also use the list. We can't just // move the elements out. @@ -1906,7 +1943,11 @@ std::array generic_to( template c10::Dict generic_to( IValue ivalue, +<<<<<<< HEAD _fake_type> /*unused*/) { +======= + _fake_type>) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return impl::toTypedDict(std::move(ivalue).toGenericDict()); } @@ -1915,7 +1956,11 @@ C10_DEPRECATED_MESSAGE( "IValues based on std::unordered_map are slow and deprecated. 
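The generic_to overloads above never receive a value of the requested type; they receive an empty _fake_type<T> tag whose only purpose is to steer overload resolution toward the right conversion. A minimal sketch of that tag-dispatch idiom (fake_type, Boxed and unbox are illustrative names, not the real API):

    #include <string>

    template <class T>
    struct fake_type {};   // empty tag carrying only a type

    struct Boxed {
      int i = 0;
      std::string s;
    };

    int generic_to(const Boxed& v, fake_type<int>) { return v.i; }
    std::string generic_to(const Boxed& v, fake_type<std::string>) { return v.s; }

    template <class T>
    T unbox(const Boxed& v) {
      return generic_to(v, fake_type<T>{});   // the tag argument picks the overload
    }
    // unbox<int>(b) and unbox<std::string>(b) call different overloads of generic_to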
Please use c10::Dict instead.") std::unordered_map generic_to( IValue ivalue, +<<<<<<< HEAD _fake_type> /*unused*/) { +======= + _fake_type>) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::unordered_map specialized_dict; for (const auto& item : std::move(ivalue).toGenericDict()) { @@ -1926,7 +1971,11 @@ std::unordered_map generic_to( } template +<<<<<<< HEAD std::optional generic_to(IValue ivalue, _fake_type> /*unused*/) { +======= +std::optional generic_to(IValue ivalue, _fake_type>) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (ivalue.isNone()) { return std::nullopt; } @@ -1937,7 +1986,11 @@ namespace detail { template Tuple generic_to_tuple_impl( const ivalue::TupleElements& t, +<<<<<<< HEAD std::index_sequence /*unused*/) { +======= + std::index_sequence) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::make_tuple( t[INDEX].to::type>()...); } @@ -1951,7 +2004,11 @@ template < std::is_lvalue_reference..., std::negation>...>, std::nullptr_t> = nullptr> +<<<<<<< HEAD std::tuple generic_to(const IValue& ivalue, _fake_type> /*unused*/) { +======= +std::tuple generic_to(const IValue& ivalue, _fake_type>) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto& vals = ivalue.toTupleRef().elements(); TORCH_CHECK(vals.size() == sizeof...(Args)); return detail::generic_to_tuple_impl>(vals, Indices{}); @@ -2311,7 +2368,11 @@ inline IValue::IValue(std::optional v) : IValue() { } } +<<<<<<< HEAD inline IValue::IValue(std::nullopt_t /*unused*/) : IValue() {} +======= +inline IValue::IValue(std::nullopt_t) : IValue() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Object) { @@ -2482,6 +2543,7 @@ namespace ivalue { namespace detail { template +<<<<<<< HEAD IValue from_(T&& x, std::true_type /*unused*/) { return IValue(std::forward(x)); } @@ -2491,6 +2553,17 @@ IValue from_(c10::intrusive_ptr x, std::false_type /*unused*/) { } template IValue from_(T&& /*x*/, std::false_type /*unused*/) { +======= +IValue from_(T&& x, std::true_type) { + return IValue(std::forward(x)); +} +template +IValue from_(c10::intrusive_ptr x, std::false_type) { + return IValue(std::move(x)); +} +template +IValue from_(T&& /*x*/, std::false_type) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static_assert( guts::false_t::value, "You are calling from with a type that it doesn't support, and isn't a potential custom class (ie: is an intrusive_ptr)"); @@ -2546,19 +2619,31 @@ struct MaybeOwnedTraits { return &borrow; } +<<<<<<< HEAD static bool debugBorrowIsValid(const borrow_type& /*unused*/) { +======= + static bool debugBorrowIsValid(const borrow_type&) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } }; template <> struct IValue::TagType { +<<<<<<< HEAD static TORCH_API c10::TypePtr get(const IValue& /*v*/); +======= + static TORCH_API c10::TypePtr get(const IValue&); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
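generic_to_tuple_impl above expands a compile-time index pack so that each slot of the resulting std::tuple is converted from the matching element of a runtime container. A small standalone sketch of the same std::index_sequence technique, converting a vector of strings (to_tuple/to_tuple_impl are illustrative names):

    #include <cstddef>
    #include <string>
    #include <tuple>
    #include <utility>
    #include <vector>

    template <class... Args, std::size_t... I>
    std::tuple<Args...> to_tuple_impl(const std::vector<std::string>& v,
                                      std::index_sequence<I...>) {
      return std::make_tuple(Args(v[I])...);   // one conversion per index in the pack
    }

    template <class... Args>
    std::tuple<Args...> to_tuple(const std::vector<std::string>& v) {
      return to_tuple_impl<Args...>(v, std::index_sequence_for<Args...>{});
    }
    // to_tuple<std::string, std::string>({"a", "b"}) yields ("a", "b")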
float/bfloat16/half (#2791)) }; template <> struct IValue::TagType { +<<<<<<< HEAD static TORCH_API c10::TypePtr get(const IValue& /*v*/); +======= + static TORCH_API c10::TypePtr get(const IValue&); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; template diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 35c0d3530adcc..bceb3a3c7f14e 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -8,7 +8,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -117,8 +120,15 @@ struct SingleElementType : public SharedType { protected: SingleElementType(TypePtr elem) : SharedType(Kind), elem(std::move(elem)) { +<<<<<<< HEAD TORCH_CHECK(this->elem, c10::str( "Can not create ", typeKindToString(Kind), " with None type")); +======= + if (!this->elem) { + throw std::runtime_error(c10::str( + "Can not create ", typeKindToString(Kind), " with None type")); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } private: @@ -373,7 +383,11 @@ struct TORCH_API SymbolicShape { // Unranked shape constructor. SymbolicShape() : dims_(std::nullopt) {} +<<<<<<< HEAD // Known rank but unknown dimensions. +======= + // Known rank but unknown dimentions. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) SymbolicShape(std::optional rank) : dims_(std::nullopt) { if(!rank) { return; @@ -415,12 +429,24 @@ struct TORCH_API SymbolicShape { } ShapeSymbol operator[](size_t i) const { +<<<<<<< HEAD TORCH_CHECK(dims_, "Rank isn't fixed"); +======= + if (!dims_) { + throw std::runtime_error("Rank isn't fixed"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*dims_).at(i); } ShapeSymbol at(size_t i) const { +<<<<<<< HEAD TORCH_CHECK(dims_, "Rank isn't fixed"); +======= + if (!dims_) { + throw std::runtime_error("Rank isn't fixed"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*dims_).at(i); } @@ -515,7 +541,13 @@ struct VaryingShape { } const std::optional &operator[](size_t i) const { +<<<<<<< HEAD TORCH_CHECK(dims_, "Rank isn't fixed"); +======= + if (!dims_) { + throw std::runtime_error("Rank isn't fixed"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*dims_).at(i); } @@ -884,9 +916,15 @@ struct TORCH_API ListType // global singleton // Given an inner type T and an identifier, +<<<<<<< HEAD // this function will return the global singleton type pointer // the type List. // The extra "identifier" argument is needed because we have multiple container types +======= + // this function wil return the global singleton type pointer + // the type List. + // The extra "identifier" argument is needed beccause we have multiple container types +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // that all re-use this function (List, array, etc.) 
static TypePtr get(const std::string& identifier, TypePtr inner); @@ -950,7 +988,13 @@ struct TORCH_API DictType : public SharedType { TypePtr createWithContained( std::vector contained_types) const override { +<<<<<<< HEAD TORCH_CHECK(contained_types.size() == 2, "Expected 2 contained types"); +======= + if (contained_types.size() != 2) { + throw std::runtime_error("Expected 2 contained types"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return create(std::move(contained_types.at(0)), std::move(contained_types.at(1))); } @@ -1225,7 +1269,11 @@ struct TORCH_API TupleType : public NamedType { std::shared_ptr schema_; }; +<<<<<<< HEAD // the common supertype of all Enums, only used in operator registration. +======= +// the common supertype of all Enums, only used in operator registraion. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // EnumType <: AnyEnumType for all Enums struct AnyEnumType; using AnyEnumTypePtr = SingletonTypePtr; diff --git a/aten/src/ATen/core/jit_type_base.h b/aten/src/ATen/core/jit_type_base.h index 4db1cb18883be..53631dd03b2e9 100644 --- a/aten/src/ATen/core/jit_type_base.h +++ b/aten/src/ATen/core/jit_type_base.h @@ -185,11 +185,19 @@ struct TORCH_API Type { : repr_(nullptr) {} /* implicit */ SingletonOrSharedTypePtr(SingletonTypePtr p) +<<<<<<< HEAD : repr_(makeSingletonSharedPtr(p.get())) {} template , bool> = true> /* implicit */ SingletonOrSharedTypePtr(SingletonTypePtr p) : repr_(makeSingletonSharedPtr(static_cast(p.get()))) {} +======= + : repr_(p) {} + + template , bool> = true> + /* implicit */ SingletonOrSharedTypePtr(SingletonTypePtr p) + : repr_(SingletonTypePtr(p.get())) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We need to support construction from T* for pybind. The problem @@ -202,8 +210,13 @@ struct TORCH_API Type { // Case 2: if T is exactly Type, we need to do a dynamic_cast to // check if it's a SharedType and do the right thing. // +<<<<<<< HEAD // Case 3: Otherwise, T is not a SharedType. Use a singleton // pointer. +======= + // Case 3: Otherwise, T is not a SharedType. (debug-check this + // assumption!) Use a singleton pointer. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template , bool> = true> /* implicit */ SingletonOrSharedTypePtr(T* p) : SingletonOrSharedTypePtr(static_cast::type>(p)->shared_from_this()) {} @@ -211,15 +224,25 @@ struct TORCH_API Type { template , bool> = true> /* implicit */ SingletonOrSharedTypePtr(T* p) { if (auto* shared_p = dynamic_cast::type>(p)) { +<<<<<<< HEAD repr_ = shared_p->shared_from_this(); } else { repr_ = makeSingletonSharedPtr(p); +======= + repr_ = Repr(shared_p->shared_from_this()); + } else { + repr_ = Repr(p); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } template && !std::is_base_of_v, bool> = true> /* implicit */ SingletonOrSharedTypePtr(T* p) +<<<<<<< HEAD : repr_(makeSingletonSharedPtr(p)) { +======= + : repr_(p) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dynamic_cast::type>(p) == nullptr); } @@ -230,6 +253,7 @@ struct TORCH_API Type { ~SingletonOrSharedTypePtr() = default; T* get() const { +<<<<<<< HEAD return repr_.get(); } @@ -243,6 +267,21 @@ struct TORCH_API Type { bool operator!=(std::nullptr_t) const { return repr_ != nullptr; +======= + return repr_.isSharedAndNonNull() ? repr_.shared_.repr_.get() : static_cast(repr_.rawRepr().first); + } + + operator bool() const { + return repr_.isNonNull(); + } + + bool operator==(std::nullptr_t) const { + return !repr_.isNonNull(); + } + + bool operator!=(std::nullptr_t) const { + return repr_.isNonNull(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template , void>, bool> = true> @@ -255,6 +294,7 @@ struct TORCH_API Type { } private: +<<<<<<< HEAD // Use shared_ptr's aliasing constructor to create a non-owning pointer // to a singleton. The lifetime is tied to the null shared_ptr, so there's // no reference counting overhead for the singleton itself. @@ -263,6 +303,140 @@ struct TORCH_API Type { } std::shared_ptr repr_; +======= + // NOTE: SharedPtrWrapper exists to work around a baffling bug in + // nvcc; see comment in destroy() below. + struct SharedPtrWrapper { + SharedPtrWrapper(std::shared_ptr &&x) + : repr_(std::move(x)) {} + std::shared_ptr repr_; + }; + union Repr { + Repr() : Repr(nullptr) {} + + explicit Repr(std::shared_ptr x) + : shared_(std::move(x)) {} + + explicit Repr(std::nullptr_t) + : singletonRepr_(nullptr) {} + + explicit Repr(SingletonTypePtr p) + : singletonRepr_(p.get()) {} + + ~Repr() { + destroy(); + } + + // NOTE: the only non-UB way to access our null state is through + // rawRepr(), because our copy operation doesn't preserve which + // union member is active for null pointers. 
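The HEAD side above stores even singleton types in a plain std::shared_ptr by building a non-owning pointer with the aliasing constructor, while the incoming side keeps the hand-rolled union that follows. A minimal sketch of the aliasing-constructor trick (Type, intSingleton and makeNonOwning are illustrative stand-ins):

    #include <cassert>
    #include <memory>

    struct Type { int kind = 0; };

    Type& intSingleton() {
      static Type t{1};          // lives for the whole program
      return t;
    }

    std::shared_ptr<Type> makeNonOwning(Type* p) {
      // Aliasing constructor: share ownership with an empty shared_ptr (which owns
      // nothing) but point at p; use_count() stays 0 and p is never deleted.
      return std::shared_ptr<Type>(std::shared_ptr<void>(), p);
    }

    int main() {
      auto sp = makeNonOwning(&intSingleton());
      assert(sp.use_count() == 0 && sp->kind == 1);
    }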
+ Repr(const Repr& rhs) { + if (rhs.isSharedAndNonNull()) { + new (&shared_) SharedPtrWrapper(rhs.shared_); + } else { + singletonRepr_.singleton_ = static_cast(rhs.rawRepr().first); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(rhs.singletonRepr_.unused_ == nullptr); + singletonRepr_.unused_ = nullptr; + } + } + + Repr(Repr&& rhs) noexcept { + if (rhs.isSharedAndNonNull()) { + new (&shared_) SharedPtrWrapper(std::move(rhs.shared_)); + } else { + singletonRepr_.singleton_ = static_cast(rhs.rawRepr().first); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(rhs.singletonRepr_.unused_ == nullptr); + singletonRepr_.unused_ = nullptr; + } + } + + Repr& operator=(const Repr& rhs) { + if (&rhs == this) { + return *this; + } + if (rhs.isSharedAndNonNull()) { + if (isSharedAndNonNull()) { + shared_ = rhs.shared_; + } else { + new (&shared_) SharedPtrWrapper(rhs.shared_); + } + } else { + if (isSharedAndNonNull()) { + destroy(); + } + singletonRepr_.singleton_ = static_cast(rhs.rawRepr().first); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(rhs.rawRepr().nullIfSingleton_ == nullptr); + singletonRepr_.unused_ = nullptr; + } + return *this; + } + + Repr& operator=(Repr&& rhs) noexcept { + if (&rhs == this) { + return *this; + } + if (rhs.isSharedAndNonNull()) { + if (isSharedAndNonNull()) { + shared_ = std::move(rhs.shared_); + } else { + new (&shared_) SharedPtrWrapper(std::move(rhs.shared_)); + } + } else { + if (isSharedAndNonNull()) { + destroy(); + } + singletonRepr_.singleton_ = static_cast(rhs.rawRepr().first); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(rhs.rawRepr().nullIfSingleton_ == nullptr); + singletonRepr_.unused_ = nullptr; + } + return *this; + } + + SharedPtrWrapper shared_; + + struct SingletonRepr { + explicit SingletonRepr(T* s) : singleton_(s) {} + T* singleton_; + void* unused_ = nullptr; + } singletonRepr_; + struct RawRepr { + void* first; + void* nullIfSingleton_; + }; + + // It is UB to read the singleton part of Repr if it was + // constructed as a shared_ptr and vice versa, but memcpying out + // the representation is always OK, so here's an accessor to obey + // the letter of the law. + RawRepr rawRepr() const { + RawRepr repr{}; + memcpy(&repr, reinterpret_cast(this), sizeof(RawRepr)); + return repr; + } + + bool isNonNull() const { + auto repr = rawRepr(); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(repr.nullIfSingleton_ == nullptr || repr.first != nullptr); + return repr.first != nullptr; + } + + bool isSharedAndNonNull() const { + return rawRepr().nullIfSingleton_ != nullptr; + } + + private: + void destroy() { + if (isSharedAndNonNull()) { + // Without SharedPtrWrapper, this line would read + // `shared_.~shared_ptr()` and nvcc would complain with + // "error: expected primary-expression before '>' token" + // referring to the "t" in "shared_ptr". SharedPtrWrapper + // exists to work around this compiler bug. 
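The union-based Repr above never reads an inactive member directly; it memcpy's the object bytes into a plain two-pointer struct and inspects those, relying on the singleton representation always leaving its second word null. A small sketch of that pattern, assuming only that both states occupy two pointer-sized words (Slot, RawView and inspect are illustrative names):

    #include <cstring>

    union Slot {
      struct Shared { void* ptr; void* ctrl; } shared;            // owning state: ctrl is non-null
      struct Singleton { void* ptr; void* null_marker; } single;  // non-owning: null_marker stays nullptr
    };

    struct RawView { void* first; void* second; };

    RawView inspect(const Slot& s) {
      static_assert(sizeof(RawView) == sizeof(Slot), "both states are two pointer-sized words");
      RawView v;
      std::memcpy(&v, &s, sizeof(v));   // copying bytes never reads an inactive union member
      return v;
    }

    bool isSingletonOrNull(const Slot& s) {
      return inspect(s).second == nullptr;   // mirrors the inverse of isSharedAndNonNull()
    }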
+ shared_.~SharedPtrWrapper(); + } + } + } repr_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; using TypePtr = SingletonOrSharedTypePtr; @@ -553,7 +727,11 @@ inline TypePtr Type::withContained(std::vector contained_types) { } +<<<<<<< HEAD inline bool operator==(const Type& lhs, const Type& rhs) { +======= +TORCH_API inline bool operator==(const Type& lhs, const Type& rhs) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (C10_UNLIKELY(!rhs.symmetric())) { return rhs.equals(lhs); } diff --git a/aten/src/ATen/core/op_registration/infer_schema.h b/aten/src/ATen/core/op_registration/infer_schema.h index 0ee79ed85930b..adff65171c040 100644 --- a/aten/src/ATen/core/op_registration/infer_schema.h +++ b/aten/src/ATen/core/op_registration/infer_schema.h @@ -44,7 +44,11 @@ constexpr int checkStaticTypes() { } template +<<<<<<< HEAD constexpr std::array createArgumentVectorFromTypes(std::index_sequence /*unused*/) { +======= +constexpr std::array createArgumentVectorFromTypes(std::index_sequence) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ( // Check types for common errors checkStaticTypes(), diff --git a/aten/src/ATen/core/op_registration/op_allowlist.h b/aten/src/ATen/core/op_registration/op_allowlist.h index 1f39ba4e38717..04ff9c3089850 100644 --- a/aten/src/ATen/core/op_registration/op_allowlist.h +++ b/aten/src/ATen/core/op_registration/op_allowlist.h @@ -114,7 +114,11 @@ constexpr bool allowlist_contains(std::string_view allowlist, std::string_view i } next++; } else { +<<<<<<< HEAD if (allowlist.substr(cur) == item) { +======= + if (allowlist.substr(cur).compare(item) == 0) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } break; diff --git a/aten/src/ATen/core/op_registration/op_registration.cpp b/aten/src/ATen/core/op_registration/op_registration.cpp index b34134309cb7a..4bbe62b1ab16b 100644 --- a/aten/src/ATen/core/op_registration/op_registration.cpp +++ b/aten/src/ATen/core/op_registration/op_registration.cpp @@ -73,7 +73,11 @@ c10::FunctionSchema RegisterOperators::inferSchemaFromKernels_( std::optional inferred_schema = std::nullopt; for (const auto& kernel : options.kernels) { +<<<<<<< HEAD if (nullptr != kernel.inferred_function_schema) { +======= + if (nullptr != kernel.inferred_function_schema.get()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (!inferred_schema.has_value()) { inferred_schema = *kernel.inferred_function_schema; break; diff --git a/aten/src/ATen/core/op_registration/op_registration.h b/aten/src/ATen/core/op_registration/op_registration.h index a2e8d9e2a9e1d..5c6b1e23b0d16 100644 --- a/aten/src/ATen/core/op_registration/op_registration.h +++ b/aten/src/ATen/core/op_registration/op_registration.h @@ -21,7 +21,11 @@ namespace c10 { namespace detail { // The first argument of the schema might be of type DispatchKeySet, in which case we remove it. 
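allowlist_contains above (where the two sides differ only in substr(cur) == item versus .compare(item) == 0) scans a ';'-separated list without allocating. A standalone constexpr sketch of the same walk:

    #include <cstddef>
    #include <string_view>

    constexpr bool allowlistContains(std::string_view allowlist, std::string_view item) {
      std::size_t cur = 0;
      while (true) {
        std::size_t next = allowlist.find(';', cur);
        if (next == std::string_view::npos) {
          return allowlist.substr(cur) == item;            // last (or only) token
        }
        if (allowlist.substr(cur, next - cur) == item) {
          return true;
        }
        cur = next + 1;                                    // skip past the ';'
      }
    }
    static_assert(allowlistContains("aten::add;aten::mul", "aten::mul"));
    static_assert(!allowlistContains("aten::add;aten::mul", "aten::sub"));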
+<<<<<<< HEAD // We do this because every argument in a function schema is expected to be convertible +======= +// We do this because every argument in a function schema is expected to be convertable +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // to an ivalue, but DispatchKeySet is not a type we want the jit to be aware of. // See Note [Plumbing Keys Through The Dispatcher] template @@ -411,6 +415,10 @@ class TORCH_API RegisterOperators final { Options() : schemaOrName_(std::nullopt) +<<<<<<< HEAD +======= + , kernels() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) , aliasAnalysisKind_(std::nullopt) {} @@ -419,6 +427,10 @@ class TORCH_API RegisterOperators final { struct KernelRegistrationConfig final { KernelRegistrationConfig() : dispatch_key(std::nullopt) +<<<<<<< HEAD +======= + , func() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) , cpp_signature(std::nullopt) , inferred_function_schema(nullptr) {} diff --git a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp index e3f68128a9e14..dd854589a854e 100644 --- a/aten/src/ATen/core/op_registration/op_registration_test.cpp +++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp @@ -251,7 +251,11 @@ TEST(OperatorRegistrationTest, whenRegisteringCPUTensorType_thenCanOnlyCallUnbox callOpUnboxedWithPrecomputedDispatchKeySet(*op, c10::DispatchKeySet(c10::DispatchKey::CPU), dummyTensor(c10::DispatchKey::CUDA)); EXPECT_TRUE(called_kernel_cpu); +<<<<<<< HEAD // Ensure that dispatch key from tensor is not used here. +======= + // Ensure that disptach key from tensor is not used here. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) called_kernel_cpu = false; expectThrows([&] { callOpUnboxedWithPrecomputedDispatchKeySet(*op, c10::DispatchKeySet(c10::DispatchKey::CUDA), dummyTensor(c10::DispatchKey::CPU)); diff --git a/aten/src/ATen/core/operator_name.h b/aten/src/ATen/core/operator_name.h index 4c138ee504564..fd81a994b00ea 100644 --- a/aten/src/ATen/core/operator_name.h +++ b/aten/src/ATen/core/operator_name.h @@ -83,7 +83,11 @@ inline bool operator!=(const OperatorName& lhs, const OperatorName& rhs) { } TORCH_API std::string toString(const OperatorName& opName); +<<<<<<< HEAD TORCH_API std::ostream& operator<<(std::ostream& /*os*/, const OperatorName& /*opName*/); +======= +TORCH_API std::ostream& operator<<(std::ostream&, const OperatorName&); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace c10 diff --git a/aten/src/ATen/core/tensor_type.cpp b/aten/src/ATen/core/tensor_type.cpp index 9d8080cb8f317..c5d534a8e0973 100644 --- a/aten/src/ATen/core/tensor_type.cpp +++ b/aten/src/ATen/core/tensor_type.cpp @@ -172,7 +172,11 @@ VaryingShape TensorType::computeStrideProps( // The logic below follows what TensorIterator uses in its logic: // 1. Fast_set_up is the short-cut to identify a. channels_last and // b. contiguous format, which is what we have in the below logic. +<<<<<<< HEAD // 2. In more general cases, it does best effort to preserve permutatoin. +======= + // 2. In more generla cases, it does best effort to preserve permutatoin. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (is_channels_last_strides_2d(sizes, strides) || is_channels_last_strides_3d(sizes, strides)) { // case 1.a. short cut channels last std::iota(stride_indices.rbegin() + 1, stride_indices.rend() - 1, 2); diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index abba4e14583a3..8d9e301cd60ec 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -8,7 +8,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -827,7 +830,13 @@ TupleType::TupleType( : NamedType(TypeKind::TupleType, std::move(name)), elements_(std::move(elements)), has_free_variables_(std::any_of(elements_.begin(), elements_.end(), [](const TypePtr& v) { +<<<<<<< HEAD TORCH_CHECK(v, "Can not create tuple with None type"); +======= + if (!v) { + throw std::runtime_error("Can not create tuple with None type"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return v->hasFreeVariables(); })), schema_(std::move(schema)) { diff --git a/aten/src/ATen/core/type_ptr.h b/aten/src/ATen/core/type_ptr.h index 011a1750ecaa0..64f883ddb8bf9 100644 --- a/aten/src/ATen/core/type_ptr.h +++ b/aten/src/ATen/core/type_ptr.h @@ -16,7 +16,11 @@ class SingletonTypePtr { /* implicit */ SingletonTypePtr(T* p) : repr_(p) {} // We need this to satisfy Pybind11, but it shouldn't be hit. +<<<<<<< HEAD explicit SingletonTypePtr(std::shared_ptr /*unused*/) { TORCH_CHECK(false); } +======= + explicit SingletonTypePtr(std::shared_ptr) { TORCH_CHECK(false); } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using element_type = typename std::shared_ptr::element_type; diff --git a/aten/src/ATen/cpu/vec/intrinsics.h b/aten/src/ATen/cpu/vec/intrinsics.h index 70223700f6364..65752fe8628cf 100644 --- a/aten/src/ATen/cpu/vec/intrinsics.h +++ b/aten/src/ATen/cpu/vec/intrinsics.h @@ -1 +1,59 @@ +<<<<<<< HEAD #include +======= +#pragma once +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +/* GCC or clang-compatible compiler, targeting x86/x86-64 */ +#include +#elif defined(__clang__) && (defined(__ARM_NEON__) || defined(__aarch64__)) +/* Clang-compatible compiler, targeting arm neon */ +#include +#if defined(__ARM_FEATURE_SVE) +/* CLANG-compatible compiler, targeting ARM with SVE */ +#include +#endif +#elif defined(_MSC_VER) +/* Microsoft C/C++-compatible compiler */ +#include +#if _MSC_VER <= 1900 +#define _mm256_extract_epi64(X, Y) \ + (_mm_extract_epi64(_mm256_extractf128_si256(X, Y >> 1), Y % 2)) +#define _mm256_extract_epi32(X, Y) \ + (_mm_extract_epi32(_mm256_extractf128_si256(X, Y >> 2), Y % 4)) +#define _mm256_extract_epi16(X, Y) \ + (_mm_extract_epi16(_mm256_extractf128_si256(X, Y >> 3), Y % 8)) +#define _mm256_extract_epi8(X, Y) \ + (_mm_extract_epi8(_mm256_extractf128_si256(X, Y >> 4), Y % 16)) +#endif +#elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__aarch64__)) +/* GCC-compatible compiler, targeting ARM with NEON */ +#include +#if defined(__ARM_FEATURE_SVE) +/* GCC-compatible compiler, targeting ARM with SVE */ +#include +#endif +#if defined(MISSING_ARM_VLD1) +#include +#elif defined(MISSING_ARM_VST1) +#include +#endif +#elif 
defined(__GNUC__) && defined(__IWMMXT__) +/* GCC-compatible compiler, targeting ARM with WMMX */ +#include +#elif defined(__s390x__) +// targets Z/architecture +// we will include vecintrin later +#elif (defined(__GNUC__) || defined(__xlC__)) && \ + (defined(__VEC__) || defined(__ALTIVEC__)) +/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ +#include +/* We need to undef those tokens defined by to avoid conflicts + with the C++ types. => Can still use __bool/__vector */ +#undef bool +#undef vector +#undef pixel +#elif defined(__GNUC__) && defined(__SPE__) +/* GCC-compatible compiler, targeting PowerPC with SPE */ +#include +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h index 9e0b189bdac89..c82c34f7be0bb 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h +++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h @@ -5,7 +5,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include namespace at { namespace vec { @@ -37,7 +40,11 @@ class Vectorized { return VECTOR_WIDTH / sizeof(BFloat16); } +<<<<<<< HEAD Vectorized(); +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(svbfloat16_t v) : values(v) {} Vectorized(int val); Vectorized(BFloat16 val); @@ -164,9 +171,12 @@ class Vectorized { Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized fexp_u20() const { return exp(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized fmod(const Vectorized& q) const; Vectorized hypot(const Vectorized& b) const; Vectorized i0() const; @@ -224,12 +234,17 @@ class Vectorized { Vectorized le(const Vectorized& other) const; }; +<<<<<<< HEAD #if defined(__GNUC__) && __GNUC__ == 14 // Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE __attribute__((optimize("no-tree-vectorize"))) #endif inline std::tuple, Vectorized> convert_bfloat16_float(const Vectorized& a) { +======= +inline std::tuple, Vectorized> convert_bfloat16_float( + const Vectorized& a) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static_assert( Vectorized::size() == 2 * Vectorized::size()); auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f)); @@ -307,11 +322,14 @@ Vectorized inline operator/( return binary_operator_via_float(std::divides>(), a, b); } +<<<<<<< HEAD inline Vectorized::Vectorized() { auto vals_f = svdup_n_f32(0); values = convert_float_bfloat16(vals_f, vals_f); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized::Vectorized(int val) { auto vals_f = svdup_n_f32(val); values = convert_float_bfloat16(vals_f, vals_f); diff --git a/aten/src/ATen/cpu/vec/sve/vec_double.h b/aten/src/ATen/cpu/vec/sve/vec_double.h index 474652be17a1a..6867d1687451a 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_double.h +++ b/aten/src/ATen/cpu/vec/sve/vec_double.h @@ -38,9 +38,13 @@ class Vectorized { static constexpr size_type size() { return VECTOR_WIDTH / sizeof(double); } +<<<<<<< HEAD 
Vectorized() { values = svdup_n_f64(0); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(svfloat64_t v) : values(v) {} Vectorized(double val) { values = svdup_n_f64(val); @@ -251,9 +255,12 @@ class Vectorized { Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized fexp_u20() const { return exp(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized fmod(const Vectorized& q) const {USE_SLEEF( { return Vectorized(Sleef_fmoddx_sve(values, q)); }, { @@ -587,6 +594,7 @@ Vectorized inline fmadd( return svmad_f64_x(ptrue, a, b, c); } +<<<<<<< HEAD template <> Vectorized inline fnmadd( const Vectorized& a, @@ -611,6 +619,8 @@ Vectorized inline fnmsub( return svnmad_f64_x(ptrue, a, b, c); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // defined(CPU_CAPABILITY_SVE) } // namespace CPU_CAPABILITY diff --git a/aten/src/ATen/cpu/vec/sve/vec_float.h b/aten/src/ATen/cpu/vec/sve/vec_float.h index 7e7a8fe682ff6..24895107dd778 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_float.h +++ b/aten/src/ATen/cpu/vec/sve/vec_float.h @@ -38,9 +38,13 @@ class Vectorized { static constexpr size_type size() { return VECTOR_WIDTH / sizeof(float); } +<<<<<<< HEAD Vectorized() { values = svdup_n_f32(0); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(svfloat32_t v) : values(v) {} Vectorized(float val) { values = svdup_n_f32(val); @@ -104,6 +108,74 @@ class Vectorized { } return b; } +<<<<<<< HEAD +======= + // Implementation is picked from + // https://github.com/ARM-software/ComputeLibrary/blob/v25.01/src/core/NEON/SVEMath.inl#L105 + inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x) const { + const auto c1 = + svreinterpret_f32_u32(svdup_n_u32(0x3f7ffff6)); // x^1: 0x1.ffffecp-1f + const auto c2 = + svreinterpret_f32_u32(svdup_n_u32(0x3efffedb)); // x^2: 0x1.fffdb6p-2f + const auto c3 = + svreinterpret_f32_u32(svdup_n_u32(0x3e2aaf33)); // x^3: 0x1.555e66p-3f + const auto c4 = + svreinterpret_f32_u32(svdup_n_u32(0x3d2b9f17)); // x^4: 0x1.573e2ep-5f + const auto c5 = + svreinterpret_f32_u32(svdup_n_u32(0x3c072010)); // x^5: 0x1.0e4020p-7f + const auto shift = svreinterpret_f32_u32( + svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f + const auto inv_ln2 = svreinterpret_f32_u32( + svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f + const auto neg_ln2_hi = svreinterpret_f32_u32(svdup_n_u32( + 0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f + const auto neg_ln2_lo = svreinterpret_f32_u32(svdup_n_u32( + 0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f + const auto inf = svdup_n_f32(std::numeric_limits::infinity()); + const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5) + const auto zero = svdup_n_f32(0.f); + const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125) + // Range reduction: + // e^x = 2^n * e^r + // where: + // n = floor(x / ln(2)) + // r = x - n * ln(2) + // + // By adding x / ln(2) with 2^23 + 127 (shift): + // * As FP32 fraction part only has 23-bits, the addition of 2^23 + 127 + // forces decimal part + // of x / ln(2) out of the result. The integer part of x / ln(2) (i.e. 
+ // n) + 127 will occupy the whole fraction part of z in FP32 format. + // Subtracting 2^23 + 127 (shift) from z will result in the integer part + // of x / ln(2) (i.e. n) because the decimal part has been pushed out + // and lost. + // * The addition of 127 makes the FP32 fraction part of z ready to be + // used as the exponent + // in FP32 format. Left shifting z by 23 bits will result in 2^n. + const auto z = svmla_f32_z(pg, shift, x, inv_ln2); + const auto n = svsub_f32_z(pg, z, shift); + const auto scale = svreinterpret_f32_u32( + svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n + // The calculation of n * ln(2) is done using 2 steps to achieve accuracy + // beyond FP32. This outperforms longer Taylor series (3-4 tabs) both in + // term of accuracy and performance. + const auto r_hi = svmla_f32_z(pg, x, n, neg_ln2_hi); + const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo); + // Compute the truncated Taylor series of e^r. + // poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5) + const auto r2 = svmul_f32_z(pg, r, r); + const auto p1 = svmul_f32_z(pg, c1, r); + const auto p23 = svmla_f32_z(pg, c2, c3, r); + const auto p45 = svmla_f32_z(pg, c4, c5, r); + const auto p2345 = svmla_f32_z(pg, p23, p45, r2); + const auto p12345 = svmla_f32_z(pg, p1, p2345, r2); + auto poly = svmla_f32_z(pg, scale, p12345, scale); + // Handle underflow and overflow. + poly = svsel_f32(svcmplt_f32(pg, x, min_input), zero, poly); + poly = svsel_f32(svcmpgt_f32(pg, x, max_input), inf, poly); + return poly; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static Vectorized loadu(const void* ptr, int64_t count = size()) { if (count == size()) return svld1_f32(ptrue, reinterpret_cast(ptr)); @@ -248,6 +320,7 @@ class Vectorized { return USE_SLEEF( Vectorized(Sleef_expm1fx_u10sve(values)), map(std::expm1)); } +<<<<<<< HEAD // Implementation copied from Arm Optimized Routines: // https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/sve/expf.c Vectorized exp_u20() const { @@ -283,6 +356,10 @@ class Vectorized { } Vectorized fexp_u20() const { return exp_u20(); +======= + Vectorized exp_u20() const { + return exp(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized fmod(const Vectorized& q) const {USE_SLEEF( { return Vectorized(Sleef_fmodfx_sve(values, q)); }, @@ -418,11 +495,17 @@ class Vectorized { ptrue, svmax_f32_z(ptrue, values, CONST_MIN_TANH), CONST_MAX_TANH); // Step 2: Calculate exp(2 * x), where x is the clamped value. +<<<<<<< HEAD // svmul_f32_z computes 2 * x, and exp_u20() computes the exponential of // the result (via Vectorized, then auto-converts back to // svfloat32_t). svfloat32_t exp2x = Vectorized(svmul_f32_z(ptrue, CONST_2, x)).exp_u20(); +======= + // svmul_f32_z computes 2 * x, and svexp_f32_z computes the exponential of + // the result. + svfloat32_t exp2x = svexp_f32_z(ptrue, svmul_f32_z(ptrue, CONST_2, x)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Step 3: Calculate the numerator of the tanh function, which is exp(2x) // - 1. 
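The comment block above describes the classic range-reduction scheme: write e^x = 2^n * e^r with n = round(x / ln 2), evaluate a short polynomial for e^r, and build 2^n by sliding n + 127 into the exponent field of a float (which is what the 2^23 + 127 shift constant accomplishes). A scalar sketch of the same scheme, using illustrative Taylor coefficients rather than the tuned constants above and omitting the under/overflow clamping:

    #include <cmath>
    #include <cstdint>
    #include <cstring>

    float fast_exp(float x) {
      const float inv_ln2 = 1.4426950408889634f;
      const float neg_ln2 = -0.6931471805599453f;
      float n = std::nearbyint(x * inv_ln2);          // n = round(x / ln 2)
      float r = std::fma(n, neg_ln2, x);              // r = x - n * ln 2, |r| <= ln(2)/2
      // Degree-4 Taylor polynomial for e^r on the reduced range.
      float p = 1.0f + r * (1.0f + r * (0.5f + r * (1.0f / 6.0f + r * (1.0f / 24.0f))));
      // 2^n: place (n + 127) into the 8-bit exponent field (bits 23..30).
      uint32_t bits = static_cast<uint32_t>(static_cast<int32_t>(n) + 127) << 23;
      float scale;
      std::memcpy(&scale, &bits, sizeof(scale));
      return scale * p;                               // valid only while n + 127 stays in [1, 254]
    }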
@@ -725,6 +808,7 @@ Vectorized inline fmadd( return svmad_f32_x(ptrue, a, b, c); } +<<<<<<< HEAD template <> Vectorized inline fnmadd( const Vectorized& a, @@ -749,6 +833,8 @@ Vectorized inline fnmsub( return svnmad_f32_x(ptrue, a, b, c); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // defined(CPU_CAPABILITY_SVE) } // namespace CPU_CAPABILITY diff --git a/aten/src/ATen/cpu/vec/sve/vec_int.h b/aten/src/ATen/cpu/vec/sve/vec_int.h index f0bc42caa9502..2a5f8a2468851 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_int.h +++ b/aten/src/ATen/cpu/vec/sve/vec_int.h @@ -32,9 +32,13 @@ inline namespace CPU_CAPABILITY { static constexpr size_type size() { \ return vl; \ } \ +<<<<<<< HEAD Vectorized() { \ values = svdup_n_s##bit(0); \ } \ +======= + Vectorized() {} \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(svint##bit##_t v) : values(v) {} \ Vectorized(int##bit##_t val) { \ values = svdup_n_s##bit(val); \ diff --git a/aten/src/ATen/cpu/vec/vec128/vec128.h b/aten/src/ATen/cpu/vec/vec128/vec128.h index 9f9079d475a8f..d91194c0e19ac 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128.h @@ -6,11 +6,16 @@ #ifdef __aarch64__ #if !defined(CPU_CAPABILITY_SVE) #include +<<<<<<< HEAD #include #include #include #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif #include diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h index aae7f2a79c2ea..84580cbb86ebb 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h @@ -354,6 +354,7 @@ class Vectorized : public Vectorized16< DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs) Vectorized frac() const; +<<<<<<< HEAD DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(trunc) DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(sqrt) @@ -395,6 +396,11 @@ class Vectorized : public Vectorized16< } #else DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg) +======= + DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg) + DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(trunc) + DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(sqrt) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(reciprocal) DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator==) DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator!=) @@ -402,7 +408,10 @@ class Vectorized : public Vectorized16< DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<=) DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>) DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>=) +<<<<<<< HEAD #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #undef DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD #undef DEFINE_BINARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD @@ -451,6 +460,7 @@ template <> Vectorized inline operator+( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD #ifdef __ARM_FEATURE_BF16 bfloat16x8_t x = a; bfloat16x8_t y = b; @@ -458,12 +468,16 @@ Vectorized inline operator+( #else return 
binary_operator_via_float(std::plus>(), a, b); #endif +======= + return binary_operator_via_float(std::plus>(), a, b); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> Vectorized inline operator-( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD #ifdef __ARM_FEATURE_BF16 bfloat16x8_t x = a; bfloat16x8_t y = b; @@ -471,12 +485,16 @@ Vectorized inline operator-( #else return binary_operator_via_float(std::minus>(), a, b); #endif +======= + return binary_operator_via_float(std::minus>(), a, b); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> Vectorized inline operator*( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD #ifdef __ARM_FEATURE_BF16 bfloat16x8_t x = a; bfloat16x8_t y = b; @@ -484,12 +502,16 @@ Vectorized inline operator*( #else return binary_operator_via_float(std::multiplies>(), a, b); #endif +======= + return binary_operator_via_float(std::multiplies>(), a, b); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> Vectorized inline operator/( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD #ifdef __ARM_FEATURE_BF16 bfloat16x8_t x = a; bfloat16x8_t y = b; @@ -497,6 +519,9 @@ Vectorized inline operator/( #else return binary_operator_via_float(std::divides>(), a, b); #endif +======= + return binary_operator_via_float(std::divides>(), a, b); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // frac. Implement this here so we can use subtraction @@ -607,18 +632,22 @@ Vectorized inline fmadd( const Vectorized& a, const Vectorized& b, const Vectorized& c) { +<<<<<<< HEAD #ifdef __ARM_FEATURE_BF16 bfloat16x8_t x = a; bfloat16x8_t y = b; bfloat16x8_t z = c; return x * y + z; #else +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOTE [BF16 FMA]: There isn't an FMA that accumulates into BF16! Also, // vbfmlalbq_f32 and vbfmlaltq_f32 take the even and odd-numbered // elements, not the bottom and top half, so they don't seem // particularly useful here. Ideally we would include dot product in // the Vectorized interface... return a * b + c; +<<<<<<< HEAD #endif } @@ -636,6 +665,8 @@ Vectorized inline fnmadd( // See NOTE [BF16 FMA] above. return -a * b + c; #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> @@ -643,6 +674,7 @@ Vectorized inline fmsub( const Vectorized& a, const Vectorized& b, const Vectorized& c) { +<<<<<<< HEAD #ifdef __ARM_FEATURE_BF16 bfloat16x8_t x = a; bfloat16x8_t y = b; @@ -668,6 +700,10 @@ Vectorized inline fnmsub( // See NOTE [BF16 FMA] above. return -a * b - c; #endif +======= + // See NOTE [BF16 FMA] above. 
+ return a * b - c; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif // !defined(C10_MOBILE) && defined(__aarch64__) diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h index b2e6016bcc12e..a95484b0596b2 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h @@ -5,6 +5,7 @@ namespace at::vec { inline namespace CPU_CAPABILITY { #if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)) +<<<<<<< HEAD // Enable auto-vectorization for GCC-13+ and clang-17+ // GCC-12 has a bug: gcc.gnu.org/bugzilla/show_bug.cgi?id=117001 @@ -249,6 +250,8 @@ inline void convert(const c10::BFloat16* src, bool* dst, int64_t n) { #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template struct VecConvert< float, diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h index 67760ec967aa1..1198feadb52ee 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h @@ -83,9 +83,13 @@ class Vectorized { static constexpr size_type size() { return 4; } +<<<<<<< HEAD Vectorized() { values = vmovq_n_f32(0); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(float32x4_t v) : values(v) {} Vectorized(float val) : values{vdupq_n_f32(val)} {} Vectorized(float val0, float val1, float val2, float val3) @@ -204,6 +208,7 @@ class Vectorized { store(tmp); return tmp[idx]; } +<<<<<<< HEAD int zero_mask() const { uint32x4_t is_zero_vec = vceqzq_f32(values); const int32x4_t shift = vcombine_s32( @@ -212,6 +217,20 @@ class Vectorized { uint32x4_t bits_vec = vshlq_u32(vandq_u32(is_zero_vec, vdupq_n_u32(1)), shift); return vaddvq_u32(bits_vec); +======= + // For boolean version where we want to if any 1/all zero + // etc. can be done faster in a different way. 
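Both sides of the zero_mask() conflict below compute the same contract: bit i of the result is set when lane i of the vector is zero. A minimal scalar model of that contract follows, with a comment on how the NEON branch-free variant reaches the same mask; zero_mask4 is a hypothetical name used only for illustration.

#include <cstdio>

// Scalar model of zero_mask(): bit i is set iff lane i equals 0.
// The NEON version is branch-free: vceqzq yields all-ones per zero lane,
// AND with 1 keeps a single bit, a per-lane shift by the lane index
// positions that bit, and a horizontal add (vaddvq) collects the mask.
static int zero_mask4(const float (&lanes)[4]) {
  int mask = 0;
  for (int i = 0; i < 4; ++i) {
    if (lanes[i] == 0.f) {
      mask |= (1 << i);
    }
  }
  return mask;
}

int main() {
  float v[4] = {0.f, 1.f, 0.f, 3.f};
  std::printf("mask = 0x%x\n", zero_mask4(v));  // 0x5: lanes 0 and 2 are zero
}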
+ int zero_mask() const { + __at_align__ float tmp[size()]; + store(tmp); + int mask = 0; + for (int i = 0; i < size(); ++i) { + if (tmp[i] == 0.f) { + mask |= (1 << i); + } + } + return mask; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized isnan() const { return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(values, values))); @@ -307,6 +326,7 @@ class Vectorized { DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp) DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp2) DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(expm1) +<<<<<<< HEAD // Implementation copied from Arm Optimized Routine // https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c Vectorized exp_u20() const { @@ -350,6 +370,10 @@ class Vectorized { } Vectorized fexp_u20() const { return exp_u20(); +======= + Vectorized exp_u20() const { + return exp(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( fmod, @@ -579,6 +603,45 @@ inline Vectorized Vectorized::le( } template <> +<<<<<<< HEAD +======= +inline void convert(const float* src, int32_t* dst, int64_t n) { + int64_t i; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + vst1q_s32(dst + i, vcvtq_s32_f32(vld1q_f32(src + i))); + } +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +inline void convert(const int32_t* src, float* dst, int64_t n) { + int64_t i; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + vst1q_f32(dst + i, vcvtq_f32_s32(vld1q_s32(src + i))); + } +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline fmadd( const Vectorized& a, const Vectorized& b, @@ -587,6 +650,7 @@ Vectorized inline fmadd( } template <> +<<<<<<< HEAD Vectorized inline fnmadd( const Vectorized& a, const Vectorized& b, @@ -595,6 +659,8 @@ Vectorized inline fnmadd( } template <> +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline fmsub( const Vectorized& a, const Vectorized& b, @@ -602,6 +668,7 @@ Vectorized inline fmsub( return Vectorized(vnegq_f32(vfmsq_f32(c, a, b))); } +<<<<<<< HEAD template <> Vectorized inline fnmsub( const Vectorized& a, @@ -610,6 +677,8 @@ Vectorized inline fnmsub( return Vectorized(vnegq_f32(vfmaq_f32(c, a, b))); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized Vectorized::erf() const { // constants const Vectorized neg_zero_vec(-0.f); @@ -634,7 +703,12 @@ inline Vectorized Vectorized::erf() const { // - exp(- x * x) auto pow_2 = (*this) * (*this); auto neg_pow_2 = pow_2 ^ neg_zero_vec; +<<<<<<< HEAD auto tmp4 = neg_pow_2.exp(); +======= + auto tmp4 = neg_pow_2.map( + std::exp); // This can be swapped for a faster implementation of exp. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto tmp5 = tmp4 ^ neg_zero_vec; // erf(x) = sign(x) * (1 - r * t * exp(- x * x)) auto tmp6 = t * tmp5; diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h index c40480ec73ac2..e9925a3def4de 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h @@ -220,6 +220,7 @@ class Vectorized : public Vectorized16< std::memcpy(ptr, tmp_values, count * sizeof(float16_t)); } } +<<<<<<< HEAD int zero_mask() const { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC uint16x8_t is_zero_vec = vceqzq_f16(values); @@ -246,6 +247,10 @@ class Vectorized : public Vectorized16< return mask; #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC } +======= + // For boolean version where we want to if any 1/all zero + // etc. can be done faster in a different way. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized isnan() const { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC return vreinterpretq_f16_u16(vmvnq_u16(vceqq_f16(values, values))); @@ -569,6 +574,49 @@ inline Vectorized Vectorized::le( return (*this <= other) & Vectorized(1); } +<<<<<<< HEAD +======= +// These are global functions, so the defaults in vec_base.h should +// work fine if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is not available. +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +inline void convert(const float16_t* src, int16_t* dst, int64_t n) { + int64_t i; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + vst1q_s16(dst + i, vcvtq_s16_f16(vld1q_f16(src + i))); + } +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +inline void convert(const int16_t* src, float16_t* dst, int64_t n) { + int64_t i; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + vst1q_f16(dst + i, vcvtq_f16_s16(vld1q_s16(src + i))); + } +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> Vectorized inline fmadd( const Vectorized& a, @@ -582,6 +630,7 @@ Vectorized inline fmadd( } template <> +<<<<<<< HEAD Vectorized inline fnmadd( const Vectorized& a, const Vectorized& b, @@ -594,6 +643,8 @@ Vectorized inline fnmadd( } template <> +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline fmsub( const Vectorized& a, const Vectorized& b, @@ -604,6 +655,7 @@ Vectorized inline fmsub( return a * b - c; #endif } +<<<<<<< HEAD template <> Vectorized inline fnmsub( @@ -616,6 +668,8 @@ Vectorized inline fnmsub( return -a * b - c; #endif } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // !defined(C10_MOBILE) && defined(__aarch64__) } // namespace CPU_CAPABILITY diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h index 
5fb3679f37239..c7855f0a4fb4d 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h @@ -206,10 +206,13 @@ struct Vectorized16 { return static_cast(this)->map_with_vec_float_method( &Vectorized::exp_u20); } +<<<<<<< HEAD Derived fexp_u20() const { return static_cast(this)->map_with_vec_float_method( &Vectorized::exp_u20); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Derived fmod(const Derived& q) const { // This function is questionable with a conversion, so we use map2 return map2(q, std::fmod); diff --git a/aten/src/ATen/cpu/vec/vec256/missing_vld1_neon.h b/aten/src/ATen/cpu/vec/vec256/missing_vld1_neon.h index aa40000b6ccdb..21ae6d5aef1a8 100644 --- a/aten/src/ATen/cpu/vec/vec256/missing_vld1_neon.h +++ b/aten/src/ATen/cpu/vec/vec256/missing_vld1_neon.h @@ -1 +1,400 @@ +<<<<<<< HEAD #include +======= +/* Workaround for missing vld1_*_x2 and vst1_*_x2 intrinsics in gcc-7. */ + +__extension__ extern __inline uint8x8x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_u8_x2(const uint8_t* __a) { + uint8x8x2_t ret; + asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline int8x8x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_s8_x2(const int8_t* __a) { + int8x8x2_t ret; + asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline uint16x4x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_u16_x2(const uint16_t* __a) { + uint16x4x2_t ret; + asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline int16x4x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_s16_x2(const int16_t* __a) { + int16x4x2_t ret; + asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline uint32x2x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_u32_x2(const uint32_t* __a) { + uint32x2x2_t ret; + asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline int32x2x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_s32_x2(const int32_t* __a) { + int32x2x2_t ret; + asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline uint64x1x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_u64_x2(const uint64_t* __a) { + uint64x1x2_t ret; + asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline int64x1x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_s64_x2(const int64_t* __a) { + int64x1x2_t ret; + __builtin_aarch64_simd_oi __o; + asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline float16x4x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_f16_x2(const float16_t* __a) { + float16x4x2_t ret; + asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline float32x2x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + 
vld1_f32_x2(const float32_t* __a) { + float32x2x2_t ret; + asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline float64x1x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_f64_x2(const float64_t* __a) { + float64x1x2_t ret; + asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline poly8x8x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_p8_x2(const poly8_t* __a) { + poly8x8x2_t ret; + asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline poly16x4x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_p16_x2(const poly16_t* __a) { + poly16x4x2_t ret; + asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline poly64x1x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_p64_x2(const poly64_t* __a) { + poly64x1x2_t ret; + asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline uint8x16x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_u8_x2(const uint8_t* __a) { + uint8x16x2_t ret; + asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline int8x16x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_s8_x2(const int8_t* __a) { + int8x16x2_t ret; + asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline uint16x8x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_u16_x2(const uint16_t* __a) { + uint16x8x2_t ret; + asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline int16x8x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_s16_x2(const int16_t* __a) { + int16x8x2_t ret; + asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline uint32x4x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_u32_x2(const uint32_t* __a) { + uint32x4x2_t ret; + asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline int32x4x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_s32_x2(const int32_t* __a) { + int32x4x2_t ret; + asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline uint64x2x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_u64_x2(const uint64_t* __a) { + uint64x2x2_t ret; + asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline int64x2x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_s64_x2(const int64_t* __a) { + int64x2x2_t ret; + asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline float16x8x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_f16_x2(const float16_t* __a) { + float16x8x2_t ret; + asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline float32x4x2_t + 
__attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_f32_x2(const float32_t* __a) { + float32x4x2_t ret; + asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline float64x2x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_f64_x2(const float64_t* __a) { + float64x2x2_t ret; + asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline poly8x16x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_p8_x2(const poly8_t* __a) { + poly8x16x2_t ret; + asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline poly16x8x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_p16_x2(const poly16_t* __a) { + poly16x8x2_t ret; + asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline poly64x2x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_p64_x2(const poly64_t* __a) { + poly64x2x2_t ret; + asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +/* vst1x2 */ + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_s64_x2(int64_t* __a, int64x1x2_t val) { + asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_u64_x2(uint64_t* __a, uint64x1x2_t val) { + asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_f64_x2(float64_t* __a, float64x1x2_t val) { + asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_s8_x2(int8_t* __a, int8x8x2_t val) { + asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_p8_x2(poly8_t* __a, poly8x8x2_t val) { + asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_s16_x2(int16_t* __a, int16x4x2_t val) { + asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_p16_x2(poly16_t* __a, poly16x4x2_t val) { + asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_s32_x2(int32_t* __a, int32x2x2_t val) { + asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_u8_x2(uint8_t* __a, uint8x8x2_t val) { + asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_u16_x2(uint16_t* __a, uint16x4x2_t val) { + asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ 
extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_u32_x2(uint32_t* __a, uint32x2x2_t val) { + asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_f16_x2(float16_t* __a, float16x4x2_t val) { + asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_f32_x2(float32_t* __a, float32x2x2_t val) { + asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_p64_x2(poly64_t* __a, poly64x1x2_t val) { + asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_s8_x2(int8_t* __a, int8x16x2_t val) { + asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_p8_x2(poly8_t* __a, poly8x16x2_t val) { + asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_s16_x2(int16_t* __a, int16x8x2_t val) { + asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_p16_x2(poly16_t* __a, poly16x8x2_t val) { + asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_s32_x2(int32_t* __a, int32x4x2_t val) { + asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_s64_x2(int64_t* __a, int64x2x2_t val) { + asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_u8_x2(uint8_t* __a, uint8x16x2_t val) { + asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_u16_x2(uint16_t* __a, uint16x8x2_t val) { + asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_u32_x2(uint32_t* __a, uint32x4x2_t val) { + asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_u64_x2(uint64_t* __a, uint64x2x2_t val) { + asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_f16_x2(float16_t* __a, float16x8x2_t val) { + asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_f32_x2(float32_t* __a, 
float32x4x2_t val) { + asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_f64_x2(float64_t* __a, float64x2x2_t val) { + asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_p64_x2(poly64_t* __a, poly64x2x2_t val) { + asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); +} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/missing_vst1_neon.h b/aten/src/ATen/cpu/vec/vec256/missing_vst1_neon.h index b3d721531d246..c2c0c0d91e29c 100644 --- a/aten/src/ATen/cpu/vec/vec256/missing_vst1_neon.h +++ b/aten/src/ATen/cpu/vec/vec256/missing_vst1_neon.h @@ -1 +1,11 @@ +<<<<<<< HEAD #include +======= +/* Workaround for missing vst1q_f32_x2 in gcc-8. */ + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_f32_x2(float32_t* __a, float32x4x2_t val) { + asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val)); +} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_16bit_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_16bit_float.h index 425fb6aa79e13..19eed91ff9199 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_16bit_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_16bit_float.h @@ -488,9 +488,12 @@ class Vectorized16 { Vectorized expm1() const { return map(Sleef_expm1f8_u10); } +<<<<<<< HEAD Vectorized fexp_u20() const { return exp(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized exp_u20() const { return exp(); } diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h index 735315bee7686..00ce79a218825 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h @@ -34,9 +34,13 @@ class Vectorized> { static constexpr size_type size() { return 2; } +<<<<<<< HEAD Vectorized() { values = _mm256_setzero_pd(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(__m256d v) : values(v) {} Vectorized(c10::complex val) { double real_value = val.real(); @@ -342,6 +346,7 @@ class Vectorized> { return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ); } Vectorized> operator<( +<<<<<<< HEAD const Vectorized>& /*unused*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } @@ -355,6 +360,21 @@ class Vectorized> { } Vectorized> operator>=( const Vectorized>& /*unused*/) const { +======= + const Vectorized>&) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator<=( + const Vectorized>&) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>( + const Vectorized>&) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>=( + const Vectorized>&) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) TORCH_CHECK(false, "not supported for complex numbers"); } diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h index 5d8c69a34b9d2..4b6f518e96c7c 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h @@ -33,9 +33,13 @@ class Vectorized> { static constexpr size_type size() { return 4; } +<<<<<<< HEAD Vectorized() { values = _mm256_setzero_ps(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(__m256 v) : values(v) {} Vectorized(c10::complex val) { float real_value = val.real(); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h index d5abafedec2e6..75df7b555381a 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h @@ -31,9 +31,13 @@ class Vectorized { static constexpr size_type size() { return 4; } +<<<<<<< HEAD Vectorized() { values = _mm256_setzero_pd(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(__m256d v) : values(v) {} Vectorized(double val) { values = _mm256_set1_pd(val); @@ -200,9 +204,12 @@ class Vectorized { Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized fexp_u20() const { return exp(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized fmod(const Vectorized& q) const { return Vectorized(Sleef_fmodd4(values, q)); } @@ -496,6 +503,7 @@ Vectorized inline fmadd( } template <> +<<<<<<< HEAD Vectorized inline fnmadd( const Vectorized& a, const Vectorized& b, @@ -504,12 +512,15 @@ Vectorized inline fnmadd( } template <> +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline fmsub( const Vectorized& a, const Vectorized& b, const Vectorized& c) { return _mm256_fmsub_pd(a, b, c); } +<<<<<<< HEAD template <> Vectorized inline fnmsub( @@ -518,6 +529,8 @@ Vectorized inline fnmsub( const Vectorized& c) { return _mm256_fnmsub_pd(a, b, c); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif #endif diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h index a42a51e567a63..c8584f3ad84b8 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h @@ -1,4 +1,8 @@ #pragma once +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // DO NOT DEFINE STATIC DATA IN THIS HEADER! 
// See Note [Do not compile initializers with AVX] @@ -30,9 +34,13 @@ class Vectorized { static constexpr size_type size() { return 8; } +<<<<<<< HEAD Vectorized() { values = _mm256_setzero_ps(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(__m256 v) : values(v) {} Vectorized(float val) { values = _mm256_set1_ps(val); @@ -257,6 +265,7 @@ class Vectorized { Vectorized expm1() const { return Vectorized(Sleef_expm1f8_u10(values)); } +<<<<<<< HEAD Vectorized fexp_u20() const { const __m256 vec_c0 = _mm256_set1_ps(0.00010703434948458272f); const __m256 vec_c1 = _mm256_set1_ps(0.30354260500649682f); @@ -314,6 +323,8 @@ class Vectorized { return result; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized exp_u20() const { // A faster version of exp with ULP=20 const __m256 vec_factorial_1 = @@ -697,6 +708,7 @@ Vectorized inline fmadd( } template <> +<<<<<<< HEAD Vectorized inline fnmadd( const Vectorized& a, const Vectorized& b, @@ -705,6 +717,8 @@ Vectorized inline fnmadd( } template <> +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline fmsub( const Vectorized& a, const Vectorized& b, @@ -712,6 +726,7 @@ Vectorized inline fmsub( return _mm256_fmsub_ps(a, b, c); } +<<<<<<< HEAD template <> Vectorized inline fnmsub( const Vectorized& a, @@ -720,6 +735,8 @@ Vectorized inline fnmsub( return _mm256_fnmsub_ps(a, b, c); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // TODO: rewrite with ATEN vectorized (need to add unpack and shuffle) // Used by Inductor CPP codegen for micro gemm inline void transpose_block(at::vec::VectorizedN& input) { diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_int.h b/aten/src/ATen/cpu/vec/vec256/vec256_int.h index 998177758be8d..ce695e89a1b7e 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_int.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_int.h @@ -23,9 +23,13 @@ struct Vectorizedi { } public: +<<<<<<< HEAD Vectorizedi() { values = _mm256_setzero_si256(); } +======= + Vectorizedi() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorizedi(__m256i v) : values(v) {} operator __m256i() const { return values; @@ -55,9 +59,13 @@ class Vectorized : public Vectorizedi { return 4; } using Vectorizedi::Vectorizedi; +<<<<<<< HEAD Vectorized() { values = _mm256_setzero_si256(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(int64_t v) { values = _mm256_set1_epi64x(v); } @@ -905,7 +913,11 @@ class Vectorized8 : public Vectorizedi { // Because loadu(const void* ptr, T count) requires zero initialization for // upper 128 bits. However, by using _mm256_castsi128_si256, the upper 128 // bits of the result are undefined. +<<<<<<< HEAD // TODO We can use _mm256_zextsi128_si256 in the future, +======= + // TODO We can use _mm256_zextsi128_si256 in the furture, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // since gcc 9.3 doesn't support it now. 
__m128i input_128 = _mm_loadl_epi64(reinterpret_cast(ptr)); return _mm256_castsi128_si256(input_128); @@ -1740,7 +1752,11 @@ Vectorized inline shift_256_16( // Control masks for shuffle operation, treating 256 bits as an // array of 16-bit elements, and considering pairs of neighboring +<<<<<<< HEAD // elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and +======= + // elements. Specifially, a mask named "ctl_M_N" (M,N in [0,1], and +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // M!=N) is set so that shuffle will move element with index M from // input pair into element with index N in output pair, and element // with index M in output pair will be set to all 0s. @@ -1844,7 +1860,11 @@ Vectorized inline shift_256_16( c0 = _mm256_srav_epi32(a0, b0); c0 = _mm256_shuffle_epi8(c0, ctl_1_0); +<<<<<<< HEAD // Perform shifting the same way for input array elements with +======= + // Peform shifting the same way for input array elements with +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // idx%2==1. __m256i a1 = _mm256_and_si256(a, keep_1); __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0); @@ -1875,7 +1895,11 @@ Vectorized inline shift_256_8( // Control masks for shuffle operation, treating 256 bits as an // array of 8-bit elements, and considering quadruples of +<<<<<<< HEAD // neighboring elements. Specifically, a mask named "ctl_M_N" (M,N +======= + // neighboring elements. Specifially, a mask named "ctl_M_N" (M,N +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // in [0,1,2,3], and M!=N) is set so that shuffle will move element // with index M from input quadruple into element with index N in // output quadruple, and other elements in output quadruple will be @@ -2180,7 +2204,11 @@ Vectorized inline shift_256_8( c0 = _mm256_srlv_epi32(a0, b0); c0 = _mm256_shuffle_epi8(c0, ctl_3_0); +<<<<<<< HEAD // Perform shifting the same way for input array elements with +======= + // Peform shifting the same way for input array elements with +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // idx%4==1. __m256i a1 = _mm256_shuffle_epi8(a, ctl_1_3); __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0); @@ -2193,7 +2221,11 @@ Vectorized inline shift_256_8( c1 = _mm256_srlv_epi32(a1, b1); c1 = _mm256_shuffle_epi8(c1, ctl_3_1); +<<<<<<< HEAD // Perform shifting the same way for input array elements with +======= + // Peform shifting the same way for input array elements with +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // idx%4==2. __m256i a2 = _mm256_shuffle_epi8(a, ctl_2_3); __m256i b2 = _mm256_shuffle_epi8(b, ctl_2_0); @@ -2206,7 +2238,11 @@ Vectorized inline shift_256_8( c2 = _mm256_srlv_epi32(a2, b2); c2 = _mm256_shuffle_epi8(c2, ctl_3_2); +<<<<<<< HEAD // Perform shifting the same way for input array elements with +======= + // Peform shifting the same way for input array elements with +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // idx%4==3. 
__m256i a3 = _mm256_and_si256(a, keep_3); __m256i b3 = _mm256_shuffle_epi8(b, ctl_3_0); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h index 2b70564b9ca81..84b7eff128732 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h @@ -54,9 +54,13 @@ struct Vectorizedqi { #endif public: +<<<<<<< HEAD Vectorizedqi() { vals = _mm256_setzero_si256(); } +======= + Vectorizedqi() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorizedqi(__m256i v) : vals(v) {} operator __m256i() const { return vals; @@ -123,29 +127,46 @@ typename std::enable_if_t< } template +<<<<<<< HEAD at::vec::Vectorized inline convert_float_to_int8( at::vec::Vectorized src); template <> at::vec::Vectorized inline convert_float_to_int8( at::vec::Vectorized src) { +======= +typename std::enable_if_t< + std::is_same_v || std::is_same_v, + at::vec::Vectorized< + T>> inline convert_float_to_int8(at::vec::Vectorized src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Convert from float32 to int32 with truncation __m256i x_values_int32 = _mm256_cvttps_epi32(src); // Convert from int32 to int16 using signed saturation __m256i xy_packed_v = _mm256_packs_epi32(x_values_int32, x_values_int32); +<<<<<<< HEAD constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); // Convert from int16 to int8 using unsigned saturation __m256i xyzw_clamped_v = pack_saturate_and_clamp( xy_packed_v, xy_packed_v, min_val, max_val); +======= + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + + // Convert from int16 to uint8/int8 using unsigned saturation + __m256i xyzw_clamped_v = + pack_saturate_and_clamp(xy_packed_v, xy_packed_v, min_val, max_val); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256i permute_mask_v = _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); return _mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v); } +<<<<<<< HEAD template <> at::vec::Vectorized inline convert_float_to_int8( at::vec::Vectorized src) { @@ -169,6 +190,8 @@ at::vec::Vectorized inline convert_float_to_int8( return _mm256_castsi128_si256(result); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template __FORCE_INLINE void QuantizeAvx2( const float* src, @@ -1377,7 +1400,11 @@ Vectorized inline maximum( #if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)) std::pair, Vectorized> inline convert_int8_to_float( at::vec::Vectorized src) { +<<<<<<< HEAD auto s8x8 = vget_low_s8(src); +======= + auto s8x8 = vld1_s8(src.operator const int8_t*()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto s16x8 = vmovl_s8(s8x8); auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8)); @@ -1390,7 +1417,11 @@ std::pair, Vectorized> inline convert_int8_to_float( std::pair, Vectorized> inline convert_int8_to_float( at::vec::Vectorized src) { +<<<<<<< HEAD auto u8x8 = vget_low_u8(src); +======= + auto u8x8 = vld1_u8(src.operator const uint8_t*()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) auto u16x8 = vmovl_u8(u8x8); auto u32x4_hi = vmovl_u16(vget_high_u16(u16x8)); auto u32x4_lo = vmovl_u16(vget_low_u16(u16x8)); @@ -1402,7 +1433,11 @@ std::pair, Vectorized> inline convert_int8_to_float( Vectorized inline convert_int8_half_register_to_float( at::vec::Vectorized src) { +<<<<<<< HEAD auto s8x8 = vget_low_s8(src); +======= + auto s8x8 = vld1_s8(src.operator const int8_t*()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto s16x8 = vmovl_s8(s8x8); auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8)); @@ -1412,7 +1447,11 @@ Vectorized inline convert_int8_half_register_to_float( Vectorized inline convert_int8_half_register_to_float( at::vec::Vectorized src) { +<<<<<<< HEAD auto u8x8 = vget_low_u8(src); +======= + auto u8x8 = vld1_u8(src.operator const uint8_t*()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto u16x8 = vmovl_u8(u8x8); auto u32x4_lo = vmovl_u16(vget_low_u16(u16x8)); diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h index db574702f3ee1..7e19ccc8a9352 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h @@ -143,7 +143,11 @@ class Vectorized { const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +<<<<<<< HEAD // the mask used here returned by comparison of vec256 +======= + // the mask used here returned by comparision of vec256 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return { vec_sel(a._vec0, b._vec0, mask._vecb0), @@ -273,9 +277,12 @@ class Vectorized { Vectorized C10_ALWAYS_INLINE exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE fexp_u20() const { return exp(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized lgamma() const __ubsan_ignore_undefined__ { return {Sleef_lgammad2_u10(_vec0), Sleef_lgammad2_u10(_vec1)}; diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h index 535d3a23173d5..ad7492a56f917 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h @@ -142,7 +142,11 @@ class Vectorized { const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +<<<<<<< HEAD // the mask used here returned by comparison of vec256 +======= + // the mask used here returned by comparision of vec256 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // assuming this we can use the same mask directly with vec_sel return { vec_sel(a._vec0, b._vec0, mask._vecb0), @@ -352,9 +356,12 @@ class Vectorized { Vectorized C10_ALWAYS_INLINE exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE fexp_u20() const { return exp(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized C10_ALWAYS_INLINE log() const { return {Sleef_logf4_u10(_vec0), Sleef_logf4_u10(_vec1)}; diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h index 
7176dd15d75ed..f860f2dbb6588 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h @@ -202,7 +202,11 @@ class Vectorized { const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +<<<<<<< HEAD // the mask used here returned by comparison of vec256 +======= + // the mask used here returned by comparision of vec256 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // assuming this we can use the same mask directly with vec_sel // warning intel style mask will not work properly return { @@ -349,6 +353,29 @@ class Vectorized { }; template <> +<<<<<<< HEAD +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + vuint16 shift_vec0 = reinterpret_cast(b.vec0()); + vuint16 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)}; +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + vuint16 shift_vec0 = reinterpret_cast(b.vec0()); + vuint16 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; +} + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline maximum( const Vectorized& a, const Vectorized& b) { @@ -362,8 +389,11 @@ Vectorized inline minimum( return a.minimum(b); } +<<<<<<< HEAD DEFINE_SHIFT_FUNCS(int16_t) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h index 75d3ba381ad41..42a4a996af490 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h @@ -155,7 +155,11 @@ class Vectorized { const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +<<<<<<< HEAD // the mask used here returned by comparison of vec256 +======= + // the mask used here returned by comparision of vec256 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // assuming this we can use the same mask directly with vec_sel // warning intel style mask will not work properly return { @@ -279,6 +283,29 @@ class Vectorized { }; template <> +<<<<<<< HEAD +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + vuint32 shift_vec0 = reinterpret_cast(b.vec0()); + vuint32 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)}; +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + vuint32 shift_vec0 = reinterpret_cast(b.vec0()); + vuint32 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; +} + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline maximum( const Vectorized& a, const Vectorized& b) { @@ -292,8 +319,11 @@ Vectorized inline minimum( return a.minimum(b); } +<<<<<<< HEAD 
DEFINE_SHIFT_FUNCS(int32_t) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h index 653c277b7d033..ff1df97190265 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h @@ -119,7 +119,11 @@ class Vectorized { const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +<<<<<<< HEAD // the mask used here returned by comparison of vec256 +======= + // the mask used here returned by comparision of vec256 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return { vec_sel(a._vec0, b._vec0, mask._vecb0), @@ -232,6 +236,29 @@ class Vectorized { }; template <> +<<<<<<< HEAD +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + vuint64 shift_vec0 = reinterpret_cast(b.vec0()); + vuint64 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)}; +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + vuint64 shift_vec0 = reinterpret_cast(b.vec0()); + vuint64 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; +} + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline maximum( const Vectorized& a, const Vectorized& b) { @@ -245,8 +272,11 @@ Vectorized inline minimum( return a.minimum(b); } +<<<<<<< HEAD DEFINE_SHIFT_FUNCS(int64_t) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h b/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h index 7ca603c0b91df..95b7905203127 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h @@ -1,6 +1,9 @@ #pragma once #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -40,6 +43,7 @@ using vfloat32 = __attribute__((altivec(vector__))) float; using vfloat64 = __attribute__((altivec(vector__))) double; #endif +<<<<<<< HEAD inline auto make_vuint(vint8 v) { return reinterpret_cast(v); } @@ -53,6 +57,8 @@ inline auto make_vuint(vint64 v) { return reinterpret_cast(v); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if !defined(vec_float) C10_ALWAYS_INLINE vfloat32 vec_float(const vint32& vec_in) { vfloat32 vec_out; @@ -535,6 +541,7 @@ const vfloat64 vd_imag_half = vfloat64{0.0, 0.5}; const vfloat64 vd_sqrt2_2 = vfloat64{0.70710678118654757, 0.70710678118654757}; const vfloat64 vd_pi_2 = vfloat64{M_PI / 2.0, 0.0}; +<<<<<<< HEAD template Vectorized VsxShiftRightArith( const Vectorized& a, @@ -571,6 +578,8 @@ Vectorized VsxShiftLeftArith( return VsxShiftLeftArith(a, b); \ } +======= +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace CPU_CAPABILITY } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h b/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h index efb97b3c614db..cd430913e695c 100644 --- a/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h +++ b/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h @@ -1023,9 +1023,12 @@ struct Vectorized()>> { Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized fexp_u20() const { return exp(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized log() const { return mapSleef(Sleef_logf4_u10, Sleef_logd2_u10); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h index 975b71ce9a867..2d796a84e28a0 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512.h @@ -397,7 +397,11 @@ inline Vectorized operator&&( const __m512i* other_ = reinterpret_cast(other.as_bytes()); __m512i out = _mm512_and_si512(*self_, *other_); Vectorized ret; +<<<<<<< HEAD // We do not have a constructor that takes __m512i, so we need to memcpy +======= + // We do not have a constructer that takes __m512i, so we need to memcpy +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::memcpy(ret, &out, ret.size() * sizeof(bool)); return ret; } diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h index 844b3b1fcc1e8..cd1b61eeb2e85 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h @@ -192,9 +192,13 @@ class Vectorized16 { static constexpr size_type size() { return 32; } +<<<<<<< HEAD Vectorized16() { values = _mm512_setzero_si512(); } +======= + Vectorized16() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized16(__m512i v) : values(v) {} Vectorized16(T val) { value_type uw = val.x; @@ -537,9 +541,12 @@ class Vectorized16 { Vectorized expm1() const { return map(Sleef_expm1f16_u10); } +<<<<<<< HEAD Vectorized fexp_u20() const { return exp(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized exp_u20() const { return exp(); } diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h index 3776001fc8720..3d11a98ee0815 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h @@ -34,9 +34,13 @@ class Vectorized> { static constexpr size_type size() { return 4; } +<<<<<<< HEAD Vectorized() { values = _mm512_setzero_pd(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(__m512d v) : values(v) {} Vectorized(c10::complex val) { double real_value = val.real(); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h index d434b2a1e2070..bb91ac64c4549 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h @@ -34,9 +34,13 @@ class 
Vectorized> { static constexpr size_type size() { return 8; } +<<<<<<< HEAD Vectorized() { values = _mm512_setzero_ps(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(__m512 v) : values(v) {} Vectorized(c10::complex val) { float real_value = val.real(); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h index 438fd31e91618..4fcab45731748 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h @@ -34,9 +34,13 @@ class Vectorized { static constexpr size_type size() { return 8; } +<<<<<<< HEAD Vectorized() { values = _mm512_setzero_pd(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(__m512d v) : values(v) {} Vectorized(double val) { values = _mm512_set1_pd(val); @@ -223,9 +227,12 @@ class Vectorized { Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized fexp_u20() const { return exp(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized fmod(const Vectorized& q) const { return Vectorized(Sleef_fmodd8(values, q)); } @@ -537,6 +544,7 @@ Vectorized inline fmadd( } template <> +<<<<<<< HEAD Vectorized inline fnmadd( const Vectorized& a, const Vectorized& b, @@ -545,6 +553,8 @@ Vectorized inline fnmadd( } template <> +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline fmsub( const Vectorized& a, const Vectorized& b, @@ -552,6 +562,7 @@ Vectorized inline fmsub( return _mm512_fmsub_pd(a, b, c); } +<<<<<<< HEAD template <> Vectorized inline fnmsub( const Vectorized& a, @@ -560,6 +571,8 @@ Vectorized inline fnmsub( return _mm512_fnmsub_pd(a, b, c); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } // namespace CPU_CAPABILITY diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h index 7a9e69b76c851..de0c9ea3fca26 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h @@ -32,9 +32,13 @@ class Vectorized { static constexpr size_type size() { return 16; } +<<<<<<< HEAD Vectorized() { values = _mm512_setzero_ps(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(__m512 v) : values(v) {} Vectorized(float val) { values = _mm512_set1_ps(val); @@ -312,6 +316,7 @@ class Vectorized { Vectorized expm1() const { return Vectorized(Sleef_expm1f16_u10(values)); } +<<<<<<< HEAD Vectorized fexp_u20() const { const __m512 vec_c0 = _mm512_set1_ps(0.00010703434948458272f); const __m512 vec_c1 = _mm512_set1_ps(0.30354260500649682f); @@ -366,6 +371,8 @@ class Vectorized { // final interpretation to float return _mm512_castsi512_ps(casted_integer); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized exp_u20() const { // A faster version of exp with ULP=20 const __m512 vec_factorial_1 = @@ -750,6 +757,7 @@ Vectorized inline fmadd( } template <> +<<<<<<< HEAD 
Vectorized inline fnmadd( const Vectorized& a, const Vectorized& b, @@ -758,6 +766,8 @@ Vectorized inline fnmadd( } template <> +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline fmsub( const Vectorized& a, const Vectorized& b, @@ -765,6 +775,7 @@ Vectorized inline fmsub( return _mm512_fmsub_ps(a, b, c); } +<<<<<<< HEAD template <> Vectorized inline fnmsub( const Vectorized& a, @@ -773,6 +784,8 @@ Vectorized inline fnmsub( return _mm512_fnmsub_ps(a, b, c); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // TODO: rewrite with ATEN vectorized (need to add unpack and shuffle) // Used by Inductor CPP codegen for micro gemm // Code referred to FBGEMM: diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_int.h b/aten/src/ATen/cpu/vec/vec512/vec512_int.h index 0a2f2c5f94823..2e29187a66165 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_int.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_int.h @@ -53,9 +53,13 @@ class Vectorized : public Vectorizedi { return 8; } using Vectorizedi::Vectorizedi; +<<<<<<< HEAD Vectorized() { values = _mm512_setzero_si512(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(int64_t v) { values = _mm512_set1_epi64(v); } @@ -1088,7 +1092,11 @@ class Vectorized8 : public Vectorizedi { // Because loadu(const void* ptr, T count) requires zero initialization for // upper 384 bits. However, by using _mm512_castsi128_si512, the upper 384 // bits of the result are undefined. +<<<<<<< HEAD // TODO We can use _mm512_zextsi128_si512 in the future, +======= + // TODO We can use _mm512_zextsi128_si512 in the furture, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // since gcc 9.3 doesn't support it now. __m128i input_128 = _mm_loadu_si128(reinterpret_cast(ptr)); return _mm512_castsi128_si512(input_128); @@ -1852,7 +1860,11 @@ Vectorized inline shift_512_8( // Control masks for shuffle operation, treating 512 bits as an // array of 8-bit elements, and considering pairs of neighboring +<<<<<<< HEAD // elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and +======= + // elements. Specifially, a mask named "ctl_M_N" (M,N in [0,1], and +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // M!=N) is set so that shuffle will move element with index M from // input pair into element with index N in output pair, and element // with index M in output pair will be set to all 0s. @@ -2022,7 +2034,11 @@ Vectorized inline shift_512_8( c0 = _mm512_srlv_epi16(a0, b0); c0 = _mm512_shuffle_epi8(c0, ctl_1_0); +<<<<<<< HEAD // Perform shifting the same way for input array elements with +======= + // Peform shifting the same way for input array elements with +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // idx%2==1. 
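A note for readers of this hunk, since the comments above compress the trick quite a bit: AVX-512 has no variable per-byte shift instruction, so shift_512_8 emulates one with the 16-bit shifts, handling even-indexed (idx%2==0) and odd-indexed (idx%2==1) bytes in two passes and recombining the halves with the ctl_*_* shuffle masks built earlier in the function. A rough scalar model of the right-shift path is sketched below; the helper name and exact masking are illustrative, not taken from the file.

    #include <cstdint>

    // Scalar sketch: shift each byte of a 16-bit word independently while only
    // ever using 16-bit shifts, mirroring the even/odd two-pass structure above.
    static inline uint16_t shift_byte_pair_right(uint16_t word,
                                                 unsigned shift_even,
                                                 unsigned shift_odd) {
      // even byte (idx%2==0): clear the odd byte first so its bits cannot leak in,
      // then shift the whole 16-bit lane
      uint16_t even = static_cast<uint16_t>((word & 0x00FFu) >> shift_even);
      // odd byte (idx%2==1): move it down into the low byte (the shuffle step),
      // then shift the 16-bit lane the same way
      uint16_t odd = static_cast<uint16_t>(((word >> 8) & 0x00FFu) >> shift_odd);
      // recombine: the odd result is shuffled back into the high byte
      return static_cast<uint16_t>((odd << 8) | even);
    }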
__m512i a1 = _mm512_and_si512(a, keep_1); __m512i b1 = _mm512_shuffle_epi8(b, ctl_1_0); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h index 64ba47e0f0646..ba4fa82bbeed5 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h @@ -55,9 +55,13 @@ struct Vectorizedqi { #endif public: +<<<<<<< HEAD Vectorizedqi() { vals = _mm512_setzero_si512(); } +======= + Vectorizedqi() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorizedqi(__m512i v) : vals(v) {} operator __m512i() const { return vals; @@ -125,24 +129,40 @@ typename std::enable_if_t< } template +<<<<<<< HEAD at::vec::Vectorized inline convert_float_to_int8( at::vec::Vectorized src); template <> at::vec::Vectorized inline convert_float_to_int8( at::vec::Vectorized src) { +======= +typename std::enable_if_t< + std::is_same_v || std::is_same_v, + at::vec::Vectorized< + T>> inline convert_float_to_int8(at::vec::Vectorized src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Convert from float32 to int32 with truncation __m512i x_values_int32 = _mm512_cvttps_epi32(src); // Convert from int32 to int16 using signed saturation __m512i xy_packed_v = _mm512_packs_epi32(x_values_int32, x_values_int32); +<<<<<<< HEAD constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); // Convert from int16 to int8 using unsigned saturation __m512i xyzw_clamped_v = pack_saturate_and_clamp( xy_packed_v, xy_packed_v, min_val, max_val); +======= + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + + // Convert from int16 to uint8/int8 using unsigned saturation + __m512i xyzw_clamped_v = + pack_saturate_and_clamp(xy_packed_v, xy_packed_v, min_val, max_val); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512i permute_mask_v = _mm512_set_epi32( 0x0f, 0x0b, @@ -163,6 +183,7 @@ at::vec::Vectorized inline convert_float_to_int8( return _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v); } +<<<<<<< HEAD template <> at::vec::Vectorized inline convert_float_to_int8( at::vec::Vectorized src) { @@ -178,6 +199,8 @@ at::vec::Vectorized inline convert_float_to_int8( return _mm512_castsi128_si512(int8_src); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template __FORCE_INLINE void QuantizeAvx512( const float* src, diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h index b4441981b3d87..1b86794d8a336 100644 --- a/aten/src/ATen/cpu/vec/vec_base.h +++ b/aten/src/ATen/cpu/vec/vec_base.h @@ -238,6 +238,12 @@ struct Vectorized { Vectorized vector; int_same_size_t buffer[size()]; mask.store(buffer); +<<<<<<< HEAD +======= +#if defined(__clang__) && __ARM_FEATURE_SVE +#pragma clang loop vectorize(disable) +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { if (buffer[i] & 0x01) { vector[i] = b[i]; @@ -544,9 +550,12 @@ struct Vectorized { Vectorized exp_u20() const { return map(std::exp); } +<<<<<<< HEAD Vectorized fexp_u20() const { return map(std::exp); } +======= +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized frac() const { return *this - this->trunc(); } @@ -634,7 +643,11 @@ struct Vectorized { } Vectorized neg() const { // NB: the trailing return type is needed because we need to coerce the +<<<<<<< HEAD // return value back to T in the case of unary operator- incurring a +======= + // return value back to T in the case of unary operator- incuring a +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // promotion return map([](T x) -> T { return -x; }); } @@ -1248,6 +1261,7 @@ inline Vectorized fmadd( VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fmadd) template +<<<<<<< HEAD inline Vectorized fnmadd( const Vectorized& a, const Vectorized& b, @@ -1258,6 +1272,8 @@ inline Vectorized fnmadd( VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fnmadd) template +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized fmsub( const Vectorized& a, const Vectorized& b, @@ -1268,6 +1284,7 @@ inline Vectorized fmsub( VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fmsub) template +<<<<<<< HEAD inline Vectorized fnmsub( const Vectorized& a, const Vectorized& b, @@ -1278,6 +1295,8 @@ inline Vectorized fnmsub( VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fnmsub) template +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline operator&&( const Vectorized& a, const Vectorized& b) { diff --git a/aten/src/ATen/cpu/vec/vec_half.h b/aten/src/ATen/cpu/vec/vec_half.h index dc1c23c74ae52..2bf13659596c5 100644 --- a/aten/src/ATen/cpu/vec/vec_half.h +++ b/aten/src/ATen/cpu/vec/vec_half.h @@ -3,12 +3,58 @@ #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { +<<<<<<< HEAD +======= +#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \ + !defined(__APPLE__) +static inline uint16_t float2half_scalar(float val) { +#if defined(CPU_CAPABILITY_AVX2) +#if defined(_MSC_VER) + __m256 v = _mm256_set1_ps(val); + __m128i o = + _mm256_cvtps_ph(v, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + return static_cast(_mm_cvtsi128_si32(o)); +#else + return _cvtss_sh(val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +#endif +#elif defined(CPU_CAPABILITY_AVX512) + __m512 v = _mm512_set1_ps(val); + __m256i o = + _mm512_cvtps_ph(v, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + return static_cast( + _mm_cvtsi128_si32(_mm256_castsi256_si128(o))); +#endif +} + +static inline float half2float_scalar(uint16_t val) { +#if defined(CPU_CAPABILITY_AVX2) +#if defined(_MSC_VER) + __m128i v = _mm_cvtsi32_si128(val); + __m256 o = _mm256_cvtph_ps(v); + return _mm256_cvtss_f32(o); +#else + return _cvtsh_ss(val); +#endif +#elif defined(CPU_CAPABILITY_AVX512) + __m256i v = + _mm256_setr_epi16(val, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + __m512 o = _mm512_cvtph_ps(v); + return _mm512_cvtss_f32(o); +#endif +} + +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Transpose a [2, 32] matrix to [32, 2] // Note: the output leading dimension should be 2, // 
that is, the output must be contiguous diff --git a/aten/src/ATen/cpu/vec/vec_n.h b/aten/src/ATen/cpu/vec/vec_n.h index 3de55de6f1b85..93c61ad3a44b4 100644 --- a/aten/src/ATen/cpu/vec/vec_n.h +++ b/aten/src/ATen/cpu/vec/vec_n.h @@ -263,7 +263,10 @@ class VectorizedN { VECTORIZEDN_DEFINE_UNARY_OP(exp2) VECTORIZEDN_DEFINE_UNARY_OP(expm1) VECTORIZEDN_DEFINE_UNARY_OP(exp_u20) +<<<<<<< HEAD VECTORIZEDN_DEFINE_UNARY_OP(fexp_u20) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VECTORIZEDN_DEFINE_UNARY_OP(frac) VECTORIZEDN_DEFINE_BINARY_OP(fmod) VECTORIZEDN_DEFINE_UNARY_OP(log) diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index aaed431064611..a03f958654fb7 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -16,10 +16,14 @@ #include #include +<<<<<<< HEAD #include #ifdef USE_ROCM #include +======= +#ifdef USE_ROCM +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include // until hipblas has an API to accept flags, we must use rocblas here #include @@ -110,7 +114,11 @@ static hipblasStatus_t rocBLASStatusToHIPStatus(rocblas_status error) namespace { +<<<<<<< HEAD cublasOperation_t _cublasOpFromChar(char op) { +======= +static cublasOperation_t _cublasOpFromChar(char op) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOLINTNEXTLINE(bugprone-switch-missing-default-case) switch (op) { case 'n': @@ -130,7 +138,11 @@ cublasOperation_t _cublasOpFromChar(char op) { "_cublasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`"); } +<<<<<<< HEAD void _cublasAdjustLdLevel2(int64_t m, int64_t n, int64_t* lda) { +======= +static void _cublasAdjustLdLevel2(int64_t m, int64_t n, int64_t* lda) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Note: leading dimensions generally are checked that they are > 0 // and at least as big the result requires (even if the value won't // be used). @@ -144,7 +156,11 @@ void _cublasAdjustLdLevel2(int64_t m, int64_t n, int64_t* lda) { *lda = std::max(m, 1); } +<<<<<<< HEAD void _cublasAdjustLdLevel3( +======= +static void _cublasAdjustLdLevel3( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) char transa, char transb, int64_t m, @@ -191,6 +207,7 @@ uint32_t _getAlignment(uintptr_t address) { } #endif +<<<<<<< HEAD #ifdef USE_ROCM static c10::cuda::CUDAStream _getCarveoutStream(int32_t value) { // 0 is default value, meaning full CUs i.e. 
no mask @@ -249,6 +266,8 @@ static void _syncCurrentWithCarveoutStream(hipStream_t stream, bool presync) { } #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct CublasLtWorkspace { CublasLtWorkspace() { size = at::cuda::getCUDABlasLtWorkspaceSize(); @@ -325,7 +344,11 @@ class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor< descriptor_.reset(raw_descriptor); } template +<<<<<<< HEAD void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) { +======= + inline void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOLINTNEXTLINE(bugprone-sizeof-expression) TORCH_CUDABLAS_CHECK(::cublasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(value))); } @@ -347,7 +370,11 @@ class CuBlasLtMatrixLayout : public CuBlasLtDescriptor< descriptor_.reset(raw_descriptor); } template +<<<<<<< HEAD void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) { +======= + inline void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CUDABLAS_CHECK(::cublasLtMatrixLayoutSetAttribute(descriptor(), attr, &value, sizeof(T))); } }; @@ -362,7 +389,11 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< descriptor_.reset(raw_descriptor); } template +<<<<<<< HEAD void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) { +======= + inline void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CUDABLAS_CHECK(::cublasLtMatmulPreferenceSetAttribute(descriptor(), attr, &value, sizeof(T))); } }; @@ -397,7 +428,11 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D computeType = CUBLAS_COMPUTE_64F; scaleType = CUDA_R_64F; } else if constexpr (std::is_same_v) { +<<<<<<< HEAD if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { +======= + if (at::globalContext().allowTF32CuBLAS()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) computeType = CUBLAS_COMPUTE_32F_FAST_TF32; } } else if constexpr (std::is_same_v>) { @@ -424,6 +459,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D abType = CUDA_R_16F; cType = (std::is_same_v) ? 
CUDA_R_32F : CUDA_R_16F; #ifndef USE_ROCM +<<<<<<< HEAD auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS(); if (fp16_reduction != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { @@ -435,12 +471,18 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D : CUBLASLT_REDUCTION_SCHEME_NONE; preference.setAttribute( CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask); +======= + if (!at::globalContext().allowFP16ReductionCuBLAS()) { + preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, + CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif } else if constexpr (std::is_same_v) { abType = CUDA_R_16BF; cType = (std::is_same_v) ? CUDA_R_32F : CUDA_R_16BF; #ifndef USE_ROCM +<<<<<<< HEAD auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS(); if (bf16_reduction != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { @@ -452,12 +494,21 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D : CUBLASLT_REDUCTION_SCHEME_NONE; preference.setAttribute( CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask); +======= + if (!at::globalContext().allowBF16ReductionCuBLAS()) { + preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, + CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif } else { static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented"); } +<<<<<<< HEAD +======= + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -466,7 +517,10 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, opa); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, opb); +<<<<<<< HEAD auto stream = at::cuda::getCurrentCUDAStream(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { computeDesc.setAttribute( @@ -474,12 +528,15 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D at::cuda::getCurrentDeviceProperties()->multiProcessorCount - at::globalContext()._SMCarveout_EXPERIMENTAL().value()); } +<<<<<<< HEAD #else if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { stream = _getCarveoutStream( at::globalContext()._SMCarveout_EXPERIMENTAL().value()); _syncCurrentWithCarveoutStream(stream, true); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif CuBlasLtMatrixLayout Adesc(abType, m, k, lda, opa == CUBLAS_OP_T); CuBlasLtMatrixLayout Bdesc(abType, k, n, ldb, opb == CUBLAS_OP_T); @@ -542,12 +599,16 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D &heuristicResult.algo, 
ltworkspace.ptr, ltworkspace.size, +<<<<<<< HEAD stream); #ifdef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { _syncCurrentWithCarveoutStream(stream, false); } #endif +======= + at::cuda::getCurrentCUDAStream()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (cublasStatus != CUBLAS_STATUS_SUCCESS) { TORCH_WARN( @@ -591,6 +652,11 @@ inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_D template <> void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(double)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -602,6 +668,11 @@ void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(double)) { template <> void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(float)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -613,6 +684,11 @@ void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(float)) { template <> void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -626,6 +702,11 @@ void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::co template <> void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -639,6 +720,11 @@ void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::com template inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -655,8 +741,11 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP void * beta_ptr = &fbeta; #ifdef USE_ROCM int flag = 0; +<<<<<<< 
HEAD rocblas_datatype c_type = std::is_same::value ? rocblas_datatype_f32_r : rocblas_datatype_f16_r; rocblas_datatype d_type = c_type; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if USE_GEMM_FLAGS_FP16_ALT_IMPL flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0; #endif @@ -665,8 +754,13 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP hipOperationToRocOperation(opb), (int)m, (int)n, (int)k, (void*)alpha_ptr, a, rocblas_datatype_f16_r, (int)lda, stridea, b, rocblas_datatype_f16_r, (int)ldb, strideb, +<<<<<<< HEAD (void*)beta_ptr, c, c_type, (int)ldc, stridec, c, d_type, (int)ldc, stridec, +======= + (void*)beta_ptr, c, rocblas_datatype_f16_r, (int)ldc, stridec, + c, rocblas_datatype_f16_r, (int)ldc, stridec, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) (int) num_batches, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, 0, flag))); #else @@ -710,6 +804,11 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP template inline void bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) BGEMM_CHECK_ARGVALUES(at::BFloat16); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); @@ -843,7 +942,11 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); } } +<<<<<<< HEAD #if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) +======= +#if defined(USE_ROCM) && !defined(_MSC_VER) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); } @@ -1007,6 +1110,12 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { +<<<<<<< HEAD +======= + #ifdef USE_ROCM + TORCH_CHECK(false, "bgemm input type at::Half and output type float is not supported for ROCm"); + #endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // TODO: Support tuning for Half inputs and FP32 output bgemm_internal(CUDABLAS_BGEMM_ARGS(at::Half)); } @@ -1014,7 +1123,13 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float) template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { +<<<<<<< HEAD #ifndef USE_ROCM +======= + #ifdef USE_ROCM + TORCH_CHECK(false, "bgemm input type at::BFloat16 and output type float is not supported for ROCm"); + #else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major < 8) @@ -1033,6 +1148,11 @@ inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dty template <> void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(double)) { +<<<<<<< HEAD +======= + // 
See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1044,6 +1164,11 @@ void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(double)) { template <> void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(float)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1055,6 +1180,11 @@ void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(float)) { template <> void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1068,6 +1198,11 @@ void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::comp template <> void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1081,6 +1216,11 @@ void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::compl template inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1097,8 +1237,11 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( GEMM_CHECK_ARGVALUES(at::Half); #ifdef USE_ROCM int flag = 0; +<<<<<<< HEAD rocblas_datatype c_type = std::is_same::value ? rocblas_datatype_f32_r : rocblas_datatype_f16_r; rocblas_datatype d_type = c_type; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if USE_GEMM_FLAGS_FP16_ALT_IMPL flag = at::ROCmBackwardPassGuard::is_backward_pass() ? 
rocblas_gemm_flags_fp16_alt_impl : 0; #endif @@ -1118,10 +1261,17 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( ldb, beta_ptr, c, +<<<<<<< HEAD c_type, ldc, c, d_type, +======= + rocblas_datatype_f16_r, + ldc, + c, + rocblas_datatype_f16_r, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ldc, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, @@ -1138,6 +1288,7 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( } if (prop->major >= 5) { cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH; +<<<<<<< HEAD auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS(); TORCH_CHECK(fp16_reduction != at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK, @@ -1147,6 +1298,10 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { cublas_flags = static_cast( cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); +======= + if (!at::globalContext().allowFP16ReductionCuBLAS()) { + cublas_flags = static_cast(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Disallow fp16 reductions that could lead to unexpected overflow issues. TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags)); @@ -1196,6 +1351,10 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( template inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) { +<<<<<<< HEAD +======= + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1205,6 +1364,7 @@ inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DT GEMM_CHECK_ARGVALUES(at::BFloat16); #ifndef USE_ROCM cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH; +<<<<<<< HEAD auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS(); TORCH_CHECK(bf16_reduction != at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK, @@ -1214,6 +1374,10 @@ inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DT at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { cublas_flags = static_cast( cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); +======= + if (!at::globalContext().allowBF16ReductionCuBLAS()) { + cublas_flags = static_cast(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif #if defined(USE_ROCM) @@ -1284,7 +1448,11 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); #endif } +<<<<<<< HEAD #if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) +======= +#if defined(USE_ROCM) && !defined(_MSC_VER) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { 
at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); } @@ -1300,9 +1468,15 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); } +<<<<<<< HEAD #if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { if (at::detail::getCUDAHooks().isGPUArch({"gfx11", "gfx12"})) { //no CK GEMM version +======= +#if defined(USE_ROCM) && !defined(_MSC_VER) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); } else{ at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); @@ -1352,7 +1526,11 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); } +<<<<<<< HEAD #if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) +======= +#if defined(USE_ROCM) && !defined(_MSC_VER) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); } @@ -1368,7 +1546,11 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); } +<<<<<<< HEAD #if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) +======= +#if defined(USE_ROCM) && !defined(_MSC_VER) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); } @@ -1524,6 +1706,12 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { template <> void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { +<<<<<<< HEAD +======= + #ifdef USE_ROCM + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + #endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // TODO: Support Tuning for fp16-fp32 gemm gemm_internal(CUDABLAS_GEMM_ARGS(at::Half)); } @@ -1531,7 +1719,13 @@ void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) template <> void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { +<<<<<<< HEAD #ifndef USE_ROCM +======= + #ifdef USE_ROCM + TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm"); + #else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major < 8) @@ -1591,7 +1785,11 @@ bool gemm_and_bias( computeType = CUBLAS_COMPUTE_64F; scaleType = CUDA_R_64F; } else if constexpr (std::is_same_v) { +<<<<<<< HEAD if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { +======= + if (at::globalContext().allowTF32CuBLAS()) { +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) computeType = CUBLAS_COMPUTE_32F_FAST_TF32; } } else if constexpr (std::is_same_v) { @@ -1609,6 +1807,7 @@ bool gemm_and_bias( abType = CUDA_R_16F; cType = (std::is_same_v) ? CUDA_R_32F : CUDA_R_16F; #ifndef USE_ROCM +<<<<<<< HEAD auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS(); if (fp16_reduction != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { @@ -1620,12 +1819,18 @@ bool gemm_and_bias( : CUBLASLT_REDUCTION_SCHEME_NONE; preference.setAttribute( CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask); +======= + if (!at::globalContext().allowFP16ReductionCuBLAS()) { + preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, + CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif } else if constexpr (std::is_same_v) { abType = CUDA_R_16BF; cType = (std::is_same_v) ? CUDA_R_32F : CUDA_R_16BF; #ifndef USE_ROCM +<<<<<<< HEAD auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS(); if (bf16_reduction != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { @@ -1637,6 +1842,11 @@ bool gemm_and_bias( : CUBLASLT_REDUCTION_SCHEME_NONE; preference.setAttribute( CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask); +======= + if (!at::globalContext().allowBF16ReductionCuBLAS()) { + preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, + CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif } @@ -1646,7 +1856,10 @@ bool gemm_and_bias( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, transa); cublasOperation_t transb = transpose_mat2 ? 
CUBLAS_OP_T : CUBLAS_OP_N; computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, transb); +<<<<<<< HEAD auto stream = at::cuda::getCurrentCUDAStream(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { computeDesc.setAttribute( @@ -1654,18 +1867,27 @@ bool gemm_and_bias( at::cuda::getCurrentDeviceProperties()->multiProcessorCount - at::globalContext()._SMCarveout_EXPERIMENTAL().value()); } +<<<<<<< HEAD #else if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { stream = _getCarveoutStream( at::globalContext()._SMCarveout_EXPERIMENTAL().value()); _syncCurrentWithCarveoutStream(stream, true); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS; if (activation == GEMMAndBiasActivationEpilogue::RELU) { epilogue = CUBLASLT_EPILOGUE_RELU_BIAS; } else if (activation == GEMMAndBiasActivationEpilogue::GELU) { +<<<<<<< HEAD + epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; +======= +#if CUDA_VERSION >= 11040 || defined(USE_ROCM) epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (bias != nullptr) { @@ -1726,12 +1948,16 @@ bool gemm_and_bias( &heuristicResult.algo, ltworkspace.ptr, ltworkspace.size, +<<<<<<< HEAD stream); #ifdef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { _syncCurrentWithCarveoutStream(stream, false); } #endif +======= + at::cuda::getCurrentCUDAStream()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (cublasStatus != CUBLAS_STATUS_SUCCESS) { TORCH_WARN( @@ -1863,6 +2089,7 @@ template bool gemm_and_bias( int64_t result_ld, GEMMAndBiasActivationEpilogue activation); +<<<<<<< HEAD using at::blas::ScalingType; int get_scale_mode(ScalingType scaling_type, ScalarType scale_dtype, bool use_fast_accum) { @@ -1932,6 +2159,8 @@ case ScalingType::TensorWise: } } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void scaled_gemm( char transa, char transb, @@ -1943,13 +2172,19 @@ void scaled_gemm( int64_t mat1_ld, ScalarType mat1_dtype, ScalarType mat1_scale_dtype, +<<<<<<< HEAD ScalingType mat1_scaling_type, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const void* mat2_ptr, const void* mat2_scale_ptr, int64_t mat2_ld, ScalarType mat2_dtype, ScalarType mat2_scale_dtype, +<<<<<<< HEAD ScalingType mat2_scaling_type, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const void* bias_ptr, ScalarType bias_dtype, void* result_ptr, @@ -1957,6 +2192,7 @@ void scaled_gemm( int64_t result_ld, ScalarType result_dtype, bool use_fast_accum, +<<<<<<< HEAD const std::optional& alpha) { // Note: see `cublasCommonArgs` for various non-intuitive manipulations // of input arguments to this function. 
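Before the next hunk, one orientation note: the two sides disagree on how scaling information reaches scaled_gemm. The HEAD side threads a per-operand at::blas::ScalingType (and an optional user-supplied alpha) through the call and derives the hipBLASLt row-wise path from those values, while the incoming side passes a single use_rowwise flag chosen by the caller. The sketch below shows the HEAD-style derivation in isolation; it reuses only names that appear in this diff and is illustrative rather than the exact implementation.

    // Sketch: recover a boolean row-wise flag from per-operand scaling kinds,
    // matching the check HEAD performs before selecting the legacy
    // HIPBLASLT_MATMUL_DESC_{A,B}_SCALE_POINTER_VEC_EXT attributes.
    // Assumes the at::blas::ScalingType enum (TensorWise, RowWise, ...) is visible.
    using at::blas::ScalingType;

    inline bool both_operands_rowwise(ScalingType mat1_scaling_type,
                                      ScalingType mat2_scaling_type) {
      return mat1_scaling_type == ScalingType::RowWise &&
             mat2_scaling_type == ScalingType::RowWise;
    }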
@@ -1965,20 +2201,38 @@ void scaled_gemm( // Note: alpha_val may change later depending on user-passed argument float alpha_val = 1.0; float beta_val = 0.0; +======= + bool use_rowwise) { + // Note: see `cublasCommonArgs` for various non-intuitive manupulations + // of input arguments to this function. +#if CUDA_VERSION >= 11080 || defined(USE_ROCM) + const auto computeType = CUBLAS_COMPUTE_32F; + const auto scaleType = CUDA_R_32F; + const float alpha_val = 1.0; + const float beta_val = 0.0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, _cublasOpFromChar(transa)); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb)); cublasLtMatmulDescAttributes_t matmulDescA = CUBLASLT_MATMUL_DESC_A_SCALE_POINTER; cublasLtMatmulDescAttributes_t matmulDescB = CUBLASLT_MATMUL_DESC_B_SCALE_POINTER; +<<<<<<< HEAD #if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT) // hipblaslt supported row-wise before cublas, and did so their own way (via // the SCALE_POINTERSs), but then migrated to match how cublas does it (via // the SCALE_MODEs). Here we check for this early custom mode. bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise); +======= +#if defined(USE_ROCM) +#if defined(HIPBLASLT_OUTER_VEC) + // this case is handled later as hipified CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F +#elif defined(HIPBLASLT_VEC_EXT) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (use_rowwise) { matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT; matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT; } +<<<<<<< HEAD else if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) { #if ROCM_VERSION >= 70000 if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) { @@ -1997,12 +2251,32 @@ void scaled_gemm( // rowwise isn't supported using older cublaslt or older hipblaslt TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt"); #endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT) +======= + else if(mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) { +#if ROCM_VERSION >= 70000 + if (at::detail::getCUDAHooks().isGPUArch(0, {"gfx950"})) { + // Validate matrix dimensions for MX format + TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0), + "Matrix dimensions must be multiples of 32 for MX format. 
", + "Got m=", m, ", n=", n, ", k=", k); + } +#endif + } +#else + // rowwise isn't supported using older hipblaslt + TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with older hipblaslt"); +#endif +#endif // defined(USE_ROCM) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) computeDesc.setAttribute(matmulDescA, mat1_scale_ptr); computeDesc.setAttribute(matmulDescB, mat2_scale_ptr); if (result_scale_ptr != nullptr) { computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr); } +<<<<<<< HEAD auto stream = at::cuda::getCurrentCUDAStream(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { computeDesc.setAttribute( @@ -2010,12 +2284,15 @@ void scaled_gemm( at::cuda::getCurrentDeviceProperties()->multiProcessorCount - at::globalContext()._SMCarveout_EXPERIMENTAL().value()); } +<<<<<<< HEAD #else if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { stream = _getCarveoutStream( at::globalContext()._SMCarveout_EXPERIMENTAL().value()); _syncCurrentWithCarveoutStream(stream, true); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // ifndef USE_ROCM #ifndef USE_ROCM const int8_t fastAccuMode = use_fast_accum ? 1 : 0; @@ -2036,6 +2313,7 @@ void scaled_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype)); } +<<<<<<< HEAD // Handle user-passed alpha float *alpha_ptr = &alpha_val; float *beta_ptr = &beta_val; @@ -2073,13 +2351,44 @@ void scaled_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, b_scale_mode); #endif // if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000 && defined(HIPBLASLT_OUTER_VEC)) +======= + if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) { +#if (!defined(USE_ROCM) && CUDA_VERSION >= 12080) || (defined(USE_ROCM) && ROCM_VERSION >= 70000) + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0); +#else + TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales is only supported for CUDA 12.8 or ROCm 7.0(with gfx950) and above"); +#endif // if CUDA_VERSION >= 12080 + } else if (mat1_scale_dtype == kFloat8_e4m3fn && mat2_scale_dtype == kFloat8_e4m3fn) { +#if CUDA_VERSION >= 12080 + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3); +#else + TORCH_CHECK(false, "scaled_gemm with `torch.float8_e4m3fn` scales is only supported for CUDA 12.8 and above"); +#endif // if CUDA_VERSION >= 12080 + } else if (mat1_scale_dtype == kFloat && mat2_scale_dtype == kFloat && use_rowwise) { +#if CUDA_VERSION >= 12090 || (defined(USE_ROCM) && defined(HIPBLASLT_OUTER_VEC)) + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F); +#elif defined(USE_ROCM) && defined(HIPBLASLT_VEC_EXT) + // no-op here for older hipblaslt 
ext enums, to avoid TORCH_CHECK below +#else + TORCH_CHECK(false, "scaled_gemm with `torch.float` outer vector scaling is only supported for CUDA 12.9 and above"); +#endif // if CUDA_VERSION >= 12090 + } + + auto stream = c10::cuda::getCurrentCUDAStream(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CuBlasLtMatmulPreference preference; auto ltworkspace = CublasLtWorkspace(); preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, ltworkspace.size); cublasLtMatmulHeuristicResult_t heuristicResult = {}; int returnedResult = 0; cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle(); +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( ltHandle, computeDesc.descriptor(), @@ -2120,10 +2429,17 @@ void scaled_gemm( auto is_valid_status = hipblaslt_ext::matmulIsAlgoSupported( ltHandle, computeDesc.descriptor(), +<<<<<<< HEAD alpha_ptr, Adesc.descriptor(), Bdesc.descriptor(), beta_ptr, +======= + &alpha_val, + Adesc.descriptor(), + Bdesc.descriptor(), + &beta_val, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Cdesc.descriptor(), Ddesc.descriptor(), all_algos[i].algo, @@ -2142,14 +2458,27 @@ void scaled_gemm( cublasStatus_t cublasStatus = cublasLtMatmul( ltHandle, computeDesc.descriptor(), +<<<<<<< HEAD alpha_ptr, +======= + &alpha_val, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mat1_ptr, Adesc.descriptor(), mat2_ptr, Bdesc.descriptor(), +<<<<<<< HEAD beta_ptr, // NOTE: always use result_ptr here, because cuBLASLt w/device beta=0 can't handle nullptr either result_ptr, // unused, since beta_val is 0, but hipblaslt can't handle nullptr +======= + &beta_val, +#ifdef USE_ROCM + result_ptr, // unused, since beta_val is 0, but hipblaslt can't handle nullptr +#else + nullptr, +#endif // ifdef USE_ROCM +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Cdesc.descriptor(), result_ptr, Ddesc.descriptor(), @@ -2157,11 +2486,14 @@ void scaled_gemm( ltworkspace.ptr, ltworkspace.size, stream); +<<<<<<< HEAD #ifdef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { _syncCurrentWithCarveoutStream(stream, false); } #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( cublasStatus == CUBLAS_STATUS_SUCCESS, "CUDA error: ", @@ -2187,6 +2519,11 @@ void scaled_gemm( " scaleType ", scaleType); return; +<<<<<<< HEAD +======= +#endif // if CUDA_VERSION >= 11080 || defined(USE_ROCM) + TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void int8_gemm( @@ -2213,7 +2550,10 @@ void int8_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, transa); cublasOperation_t transb = transpose_mat2 ? 
CUBLAS_OP_T : CUBLAS_OP_N; computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, transb); +<<<<<<< HEAD auto stream = at::cuda::getCurrentCUDAStream(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { computeDesc.setAttribute( @@ -2221,12 +2561,15 @@ void int8_gemm( at::cuda::getCurrentDeviceProperties()->multiProcessorCount - at::globalContext()._SMCarveout_EXPERIMENTAL().value()); } +<<<<<<< HEAD #else if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { stream = _getCarveoutStream( at::globalContext()._SMCarveout_EXPERIMENTAL().value()); _syncCurrentWithCarveoutStream(stream, true); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif CuBlasLtMatrixLayout Adesc(abType, m, k, mat1_ld, transpose_mat1); @@ -2288,7 +2631,11 @@ void int8_gemm( #else 0, #endif +<<<<<<< HEAD stream); +======= + at::cuda::getCurrentCUDAStream()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( cublasStatus == CUBLAS_STATUS_SUCCESS, "CUDA error: ", @@ -2317,11 +2664,14 @@ void int8_gemm( computeType, " scaleType ", scaleType); +<<<<<<< HEAD #ifdef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { _syncCurrentWithCarveoutStream(stream, false); } #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> @@ -2461,6 +2811,11 @@ void trsmBatched>( template <> void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -2476,6 +2831,11 @@ void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { // gemv is bw bound, and does not benefit from TF32. But the precision // loss still happens on TF32. So we disable it here. NoTF32Guard disable_tf32; +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -2488,6 +2848,11 @@ void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { template <> void gemv(CUDABLAS_GEMV_ARGTYPES(double)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -2501,6 +2866,11 @@ void gemv(CUDABLAS_GEMV_ARGTYPES(float)) { // gemv is bw bound, and does not benefit from TF32. But the precision // loss still happens on TF32. So we disable it here. 
NoTF32Guard disable_tf32; +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -2625,6 +2995,11 @@ void vdot>(CUDABLAS_DOT_ARGTYPES(c10::complex)) { reinterpret_cast(result))); } +<<<<<<< HEAD +======= +// HIP on Windows does not support +#if !(defined(USE_ROCM) && defined(_MSC_VER)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> void getrsBatched(CUDABLAS_GETRS_ARGTYPES(float)) { TORCH_CUDABLAS_CHECK(cublasSgetrsBatched( @@ -2823,5 +3198,9 @@ void gelsBatched>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::comple devInfoArray, batchSize)); } +<<<<<<< HEAD +======= +#endif // !(defined(USE_ROCM) && defined(_MSC_VER)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::cuda::blas diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h index 0295948311a59..6e7003191533e 100644 --- a/aten/src/ATen/cuda/CUDABlas.h +++ b/aten/src/ATen/cuda/CUDABlas.h @@ -14,7 +14,10 @@ */ #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include namespace at::cuda::blas { @@ -148,13 +151,19 @@ void scaled_gemm( int64_t mat1_ld, ScalarType mat1_dtype, ScalarType mat1_scale_dtype, +<<<<<<< HEAD at::blas::ScalingType mat1_scaling_type, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const void* mat2_ptr, const void* mat2_scale_ptr, int64_t mat2_ld, ScalarType mat2_dtype, ScalarType mat2_scale_dtype, +<<<<<<< HEAD at::blas::ScalingType mat2_scaling_type, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const void* bias_ptr, ScalarType bias_dtype, void* result_ptr, @@ -162,7 +171,11 @@ void scaled_gemm( int64_t result_ld, ScalarType result_dtype, bool use_fast_accum, +<<<<<<< HEAD const std::optional& alpha); +======= + bool use_rowwise); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define CUDABLAS_BGEMM_ARGTYPES(Dtype) CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype) @@ -336,6 +349,12 @@ void vdot>(CUDABLAS_DOT_ARGTYPES(c10::complex)); int m, int n, int nrhs, Dtype** dA_array, int ldda, \ Dtype** dC_array, int lddc, int* info, int *devInfoArray, int batchSize +<<<<<<< HEAD +======= +// HIP on Windows does not support getrs, geqrf, getrf, gels +#if !(defined(USE_ROCM) && defined(_MSC_VER)) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template void getrsBatched(CUDABLAS_GETRS_ARGTYPES(Dtype)) { static_assert(false&&sizeof(Dtype),"at::cuda::blas::getrsBatched: not implemented"); @@ -390,4 +409,31 @@ TORCH_CUDA_CU_API void gelsBatched>(CUDABLAS_GELS_BATCHED_A template<> TORCH_CUDA_CU_API void gelsBatched>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::complex)); +<<<<<<< HEAD +======= +#else // !(defined(USE_ROCM) && defined(_MSC_VER)) + 
+template +void getrsBatched(CUDABLAS_GETRS_ARGTYPES(Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::getrsBatched: not supported for HIP on Windows"); +} + +template +void geqrfBatched(CUDABLAS_GEQRF_BATCHED_ARGTYPES(Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::geqrfBatched: not supported for HIP on Windows"); +} + +template +void getrfBatched(CUDABLAS_GETRF_ARGTYPES(Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::getrfBatched: not supported for HIP on Windows"); +} + +template +void gelsBatched(CUDABLAS_GELS_BATCHED_ARGTYPES(Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::gelsBatched: not supported for HIP on Windows"); +} + +#endif // !(defined(USE_ROCM) && defined(_MSC_VER)) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::cuda::blas diff --git a/aten/src/ATen/cuda/CUDAEvent.h b/aten/src/ATen/cuda/CUDAEvent.h index 81b4643ac0418..f28cb7b69ccc4 100644 --- a/aten/src/ATen/cuda/CUDAEvent.h +++ b/aten/src/ATen/cuda/CUDAEvent.h @@ -2,10 +2,17 @@ #include #include +<<<<<<< HEAD #include #include #include #include +======= +#include +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -246,6 +253,7 @@ struct TORCH_CUDA_CPP_API CUDAEvent { } }; +<<<<<<< HEAD // EventPool - Thread-safe pool of CUDA events to avoid expensive cudaEventCreate // calls. cudaEventCreate when concurrently invoked from multiple threads can be // very expensive (especially on certain device/driver combinations). @@ -321,4 +329,6 @@ class EventPool { std::vector pools_; }; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::cuda diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp index 2e387fbc264d7..545ebbc3c20b8 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp @@ -15,6 +15,7 @@ namespace cuda::detail { namespace { // Total number of gpus in the system. +<<<<<<< HEAD int64_t num_gpus; // Ensures default_gens_cuda is initialized once. @@ -22,12 +23,25 @@ std::deque cuda_gens_init_flag; // Default, global CUDA generators, one per GPU. std::vector default_gens_cuda; +======= +static int64_t num_gpus; + +// Ensures default_gens_cuda is initialized once. +static std::deque cuda_gens_init_flag; + +// Default, global CUDA generators, one per GPU. +static std::vector default_gens_cuda; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /* * Populates the global variables related to CUDA generators * Warning: this function must only be called once! */ +<<<<<<< HEAD void initCUDAGenVector() { +======= +static void initCUDAGenVector() { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Ensures we only call cudaGetDeviceCount only once. static bool num_gpu_init_flag [[maybe_unused]] = []() { num_gpus = static_cast(c10::cuda::device_count()); @@ -109,7 +123,11 @@ void CUDAGeneratorState::increase(uint64_t increment) { offset_intragraph_ % 4 == 0, "RNG offset must be a multiple of 4."); // Ensures the increment does not cause overflow. 
TORCH_INTERNAL_ASSERT( +<<<<<<< HEAD offset_intragraph_ <= std::numeric_limits::max() - increment, +======= + offset_intragraph_ <= std::numeric_limits::max() - increment, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "Increment causes overflow in the offset value."); offset_intragraph_ += increment; } else { @@ -266,6 +284,7 @@ CUDAGeneratorImpl::CUDAGeneratorImpl( * See Note [Acquire lock when using random generators] */ void CUDAGeneratorImpl::set_current_seed(uint64_t seed) { +<<<<<<< HEAD if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) { state_->seed_ = seed; state_->philox_offset_per_thread_ = 0; @@ -274,6 +293,13 @@ void CUDAGeneratorImpl::set_current_seed(uint64_t seed) { TORCH_CHECK(state_->seed_ == seed, "CUDAGeneratorImpl::set_current_seed can be called during stream capture only if new seed is the same as the original seed."); // no-op case } +======= + at::cuda::assertNotCapturing( + "Cannot call CUDAGeneratorImpl::set_current_seed"); + state_->seed_ = seed; + state_->philox_offset_per_thread_ = 0; + no_reset_rnn_state_.clear(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } /** @@ -302,6 +328,12 @@ uint64_t CUDAGeneratorImpl::get_offset() const { * Gets the current seed of CUDAGeneratorImpl. */ uint64_t CUDAGeneratorImpl::current_seed() const { +<<<<<<< HEAD +======= + // Debatable if current_seed() should be allowed in captured regions. + // Conservatively disallow it for now. + at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::current_seed"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return state_->seed_; } @@ -325,9 +357,15 @@ uint64_t CUDAGeneratorImpl::seed() { */ c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { // The RNG state comprises the seed, and an offset used for Philox. +<<<<<<< HEAD constexpr size_t seed_size = sizeof(uint64_t); constexpr size_t offset_size = sizeof(int64_t); constexpr size_t total_size = seed_size + offset_size; +======= + static const size_t seed_size = sizeof(uint64_t); + static const size_t offset_size = sizeof(int64_t); + static const size_t total_size = seed_size + offset_size; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt); auto rng_state = state_tensor.data_ptr(); @@ -346,9 +384,17 @@ c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { * and size of the internal state. 
*/ void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { +<<<<<<< HEAD constexpr size_t seed_size = sizeof(uint64_t); constexpr size_t offset_size = sizeof(int64_t); constexpr size_t total_size = seed_size + offset_size; +======= + at::cuda::assertNotCapturing( + "Please ensure to utilize the CUDAGeneratorImpl::set_state_index method during capturing."); + static const size_t seed_size = sizeof(uint64_t); + static const size_t offset_size = sizeof(int64_t); + static const size_t total_size = seed_size + offset_size; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) detail::check_rng_state(new_state); @@ -400,6 +446,7 @@ c10::intrusive_ptr CUDAGeneratorImpl::graphsafe_get_state() */ void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) { // see Note [Why enforce RNG offset % 4 == 0?] +<<<<<<< HEAD // Note: If you use CUDNN RNN's, calling // set_philox_offset_per_thread instead of set_offset will cause the @@ -410,17 +457,25 @@ void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) { } else { state_->offset_intragraph_ = offset; } +======= + TORCH_CHECK(offset % 4 == 0, "offset must be a multiple of 4"); + state_->philox_offset_per_thread_ = offset; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } /** * Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl. */ uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const { +<<<<<<< HEAD if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) { return state_->philox_offset_per_thread_; } else { return state_->offset_intragraph_; } +======= + return state_->philox_offset_per_thread_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } /** @@ -461,7 +516,11 @@ void CUDAGeneratorImpl::unregister_graph(cuda::CUDAGraph* graph) { */ PhiloxCudaState CUDAGeneratorImpl::philox_cuda_state(uint64_t increment) { if (at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None) { +<<<<<<< HEAD uint64_t offset = state_->offset_intragraph_; +======= + uint32_t offset = state_->offset_intragraph_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) state_->increase(increment); return PhiloxCudaState( state_->seed_extragraph_.data_ptr(), diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.h b/aten/src/ATen/cuda/CUDAGeneratorImpl.h index d4ab49382e7ff..063579427a3b3 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.h +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.h @@ -96,16 +96,28 @@ struct CUDAGraph; struct CUDAGeneratorState : public c10::intrusive_ptr_target { uint64_t seed_; uint64_t philox_offset_per_thread_; +<<<<<<< HEAD uint64_t offset_intragraph_; bool capturing_{}; std::unordered_set registered_graphs_; at::TensorBase seed_extragraph_; at::TensorBase offset_extragraph_; +======= + uint32_t offset_intragraph_; + bool capturing_{}; + std::unordered_set registered_graphs_; + at::TensorBase seed_extragraph_{}; + at::TensorBase offset_extragraph_{}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDAGeneratorState( uint64_t seed = default_rng_seed_val, uint64_t philox_offset_per_thread = 0, +<<<<<<< HEAD uint64_t offset_intragraph = 0) +======= + uint32_t offset_intragraph = 0) 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) : seed_(seed), philox_offset_per_thread_(philox_offset_per_thread), offset_intragraph_(offset_intragraph) {} @@ -167,7 +179,11 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl { CUDAGeneratorImpl* clone_impl() const override; c10::intrusive_ptr state_; +<<<<<<< HEAD std::atomic_flag no_reset_rnn_state_; +======= + std::atomic_flag no_reset_rnn_state_{}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; namespace cuda::detail { diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp index 31d2d3f1fe589..a7bb66970e2cb 100644 --- a/aten/src/ATen/cuda/CUDAGraph.cpp +++ b/aten/src/ATen/cuda/CUDAGraph.cpp @@ -2,6 +2,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -168,9 +172,17 @@ void CUDAGraph::instantiate() { // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597 // cudaGraphInstantiateWithFlags // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga2c652a24ba93e52b99a47bec0888233 +<<<<<<< HEAD int version = 0; AT_CUDA_CHECK(cudaDriverGetVersion(&version)); if (version < 11040) { +======= +#if !defined(USE_ROCM) || ROCM_VERSION >= 60200 + int version = 0; + AT_CUDA_CHECK(cudaDriverGetVersion(&version)); + if (version < 11040) { +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Trailing NULL, NULL, 0 arguments were recommended by Cuda driver people, // who prefer not to report error message through these arguments moving forward // (they prefer return value, or errors on api calls internal to the capture) @@ -181,11 +193,19 @@ void CUDAGraph::instantiate() { #endif //Since ROCm 6.2, we want to go down this path as hipGraphExecDestroy in the destructor will not immediately free the memory. //It will wait for the next sync operation. cudaGraphInstantiateFlagAutoFreeOnLaunch will add async frees after graph launch. 
+<<<<<<< HEAD +======= +#if !defined(USE_ROCM) || ROCM_VERSION >= 60200 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_, graph_, cudaGraphInstantiateFlagAutoFreeOnLaunch)); } +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) has_graph_exec_ = true; } @@ -248,6 +268,7 @@ cudaGraph_t CUDAGraph::raw_cuda_graph() { return graph_; } +<<<<<<< HEAD cudaGraphExec_t CUDAGraph::raw_cuda_graph_exec() { TORCH_CHECK( has_graph_exec_, @@ -255,6 +276,8 @@ cudaGraphExec_t CUDAGraph::raw_cuda_graph_exec() { return graph_exec_; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void CUDAGraph::reset() { // I'd prefer these checks throw exceptions, not print warnings, // but the destructor calls reset(), and at least one CI build @@ -307,7 +330,11 @@ CUDAGraph::~CUDAGraph() { // There are recent HIP changes where hipGraphExecDestroy doesn't immediately free memory. // They wait for next sync point in order to free the memory, this is to ensure that all // hipGraphLaunch are finished before we release any memory. This feature was enabled in rocm6.2. +<<<<<<< HEAD // We need to ensure all async operations finish before deleting the object. +======= +// We need to ensure all async opreations finish before deleting the object. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if (defined(USE_ROCM) && ROCM_VERSION >= 60200) if (capture_dev_ != UNDEFINED_DEVICE) // check if capture_dev_ contains the real device id { diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h index a32e7b4b86f07..075e6ed31bc68 100644 --- a/aten/src/ATen/cuda/CUDAGraph.h +++ b/aten/src/ATen/cuda/CUDAGraph.h @@ -2,7 +2,10 @@ #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -37,7 +40,10 @@ struct TORCH_CUDA_CPP_API CUDAGraph { void enable_debug_mode(); void debug_dump(const std::string& debug_path); cudaGraph_t raw_cuda_graph(); +<<<<<<< HEAD cudaGraphExec_t raw_cuda_graph_exec(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) protected: cudaGraph_t graph_ = nullptr; @@ -56,7 +62,11 @@ struct TORCH_CUDA_CPP_API CUDAGraph { // the ID assigned by cuda during graph capture, // used to identify when a stream is participating in capture +<<<<<<< HEAD CaptureId_t capture_id_ = 0; +======= + CaptureId_t capture_id_ = -1; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // uuid used to request a particular private mempool from CUDACachingAllocator. // By default, this will be set to {id_, 0}. 
diff --git a/aten/src/ATen/cuda/CUDASparse.h b/aten/src/ATen/cuda/CUDASparse.h index e00e50b38d2de..380769f9076e0 100644 --- a/aten/src/ATen/cuda/CUDASparse.h +++ b/aten/src/ATen/cuda/CUDASparse.h @@ -6,15 +6,54 @@ #define HIPSPARSE_VERSION ((hipsparseVersionMajor*100000) + (hipsparseVersionMinor*100) + hipsparseVersionPatch) #endif +<<<<<<< HEAD +======= +// cuSparse Generic API added in CUDA 10.1 +// Windows support added in CUDA 11.0 +#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && ((CUSPARSE_VERSION >= 10300) || (CUSPARSE_VERSION >= 11000 && defined(_WIN32))) +#define AT_USE_CUSPARSE_GENERIC_API() 1 +#else +#define AT_USE_CUSPARSE_GENERIC_API() 0 +#endif + +// cuSparse Generic API descriptor pointers were changed to const in CUDA 12.0 +#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \ + (CUSPARSE_VERSION < 12000) +#define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 1 +#else +#define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 0 +#endif + +#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \ + (CUSPARSE_VERSION >= 12000) +#define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 1 +#else +#define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 0 +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(USE_ROCM) // hipSparse const API added in v2.4.0 #if HIPSPARSE_VERSION >= 200400 +<<<<<<< HEAD +#define AT_USE_HIPSPARSE_GENERIC_API() 1 +#else +#define AT_USE_HIPSPARSE_GENERIC_API() 1 +#endif +#else // USE_ROCM +======= +#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 1 +#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0 #define AT_USE_HIPSPARSE_GENERIC_API() 1 #else +#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0 +#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 1 #define AT_USE_HIPSPARSE_GENERIC_API() 1 #endif #else // USE_ROCM +#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0 +#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define AT_USE_HIPSPARSE_GENERIC_API() 0 #endif // USE_ROCM diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp index d5f04df55f9c2..742398dbdb8fb 100644 --- a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp +++ b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp @@ -12,6 +12,11 @@ cusparseStatus_t destroyConstDnMat(const cusparseDnMatDescr* dnMatDescr) { return cusparseDestroyDnMat(const_cast(dnMatDescr)); } +<<<<<<< HEAD +======= +#if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace { // If a specific GPU model does not provide native support for a given data @@ -208,4 +213,9 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input, int6 descriptor_.reset(raw_descriptor); } +<<<<<<< HEAD +======= +#endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::cuda::sparse diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.h b/aten/src/ATen/cuda/CUDASparseDescriptors.h index f12ef628e13f5..98547c4879396 100644 --- a/aten/src/ATen/cuda/CUDASparseDescriptors.h +++ b/aten/src/ATen/cuda/CUDASparseDescriptors.h @@ -35,6 +35,10 @@ class CuSparseDescriptor { std::unique_ptr> 
descriptor_; }; +<<<<<<< HEAD +======= +#if AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template struct ConstCuSparseDescriptorDeleter { void operator()(T* x) { @@ -57,6 +61,10 @@ class ConstCuSparseDescriptor { protected: std::unique_ptr> descriptor_; }; +<<<<<<< HEAD +======= +#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS || AT_USE_HIPSPARSE_CONST_DESCRIPTORS +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(USE_ROCM) using cusparseMatDescr = std::remove_pointer_t; @@ -121,8 +129,44 @@ class TORCH_CUDA_CPP_API CuSparseBsrsm2Info #endif // AT_USE_HIPSPARSE_TRIANGULAR_SOLVE +<<<<<<< HEAD cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type); +======= +#if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() + +cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type); + +#if AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() +class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor + : public CuSparseDescriptor { + public: + explicit CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1); +}; + +class TORCH_CUDA_CPP_API CuSparseConstDnMatDescriptor + : public CuSparseDescriptor { + public: + explicit CuSparseConstDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1); + cusparseDnMatDescr* unsafe_mutable_descriptor() const { + return const_cast(descriptor()); + } + cusparseDnMatDescr* unsafe_mutable_descriptor() { + return const_cast(descriptor()); + } +}; + +class TORCH_CUDA_CPP_API CuSparseDnVecDescriptor + : public CuSparseDescriptor { + public: + explicit CuSparseDnVecDescriptor(const Tensor& input); +}; + +class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor + : public CuSparseDescriptor {}; + +#elif AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor : public ConstCuSparseDescriptor< cusparseDnMatDescr, @@ -161,6 +205,10 @@ cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type); : public ConstCuSparseDescriptor< cusparseSpMatDescr, &cusparseDestroySpMat> {}; +<<<<<<< HEAD +======= +#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class TORCH_CUDA_CPP_API CuSparseSpMatCsrDescriptor : public CuSparseSpMatDescriptor { @@ -249,4 +297,9 @@ class TORCH_CUDA_CPP_API CuSparseSpGEMMDescriptor } }; +<<<<<<< HEAD +======= +#endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::cuda::sparse diff --git a/aten/src/ATen/cuda/CachingHostAllocator.cpp b/aten/src/ATen/cuda/CachingHostAllocator.cpp index 5786e87dac519..53f3aa74feeec 100644 --- a/aten/src/ATen/cuda/CachingHostAllocator.cpp +++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp @@ -9,6 +9,10 @@ #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) namespace at::cuda { namespace { @@ -71,6 +75,7 @@ using Block = HostBlock; struct CUDACachingHostAllocatorImpl : public CachingHostAllocatorImpl { private: +<<<<<<< HEAD ska::flat_hash_map use_host_register; void allocate_host_memory(size_t size, void** ptr) override { @@ -85,6 +90,11 @@ struct CUDACachingHostAllocatorImpl } void allocate_host_memory_slowpath(size_t size, void** ptr) { +======= + std::unordered_map use_host_register; + + void allocate_host_memory(size_t size, void** ptr) override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Pinned memory pointers allocated by any device can be directly used by // any other device, regardless of the current device at the time of // allocation, since we assume unified addressing. So we grab any existing @@ -123,6 +133,7 @@ struct CUDACachingHostAllocatorImpl } void free_block(Block* block) override { +<<<<<<< HEAD // We never free blocks from the reserve segment if (get_reserve_segment().initialized()) { // Check if the block is from the reserve segment @@ -138,6 +149,11 @@ struct CUDACachingHostAllocatorImpl auto start = std::chrono::steady_clock::now(); // Users may change the allocator config at will. torch unit tests do this. // However, allocations using cudaHostRegister should use corresponding +======= + auto start = std::chrono::steady_clock::now(); + // Users may change the allocator config at will. torch unit tests do this. + // However, allocations using cudaHostRegister should use corresonding +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // cudaHostUnregister and similarly for cudaHostAlloc / cudaFreeHost. void* ptr = block->ptr_; bool use_register = false; @@ -183,12 +199,21 @@ struct CUDACachingHostAllocatorImpl return true; } +<<<<<<< HEAD +======= + bool pinned_use_background_threads() override { + return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: + pinned_use_background_threads(); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) EventPool::Event create_event_internal(DeviceIndex idx) { // Leak the event pool to avoid shutdown issue. 
static auto* event_pool = new EventPool(); return event_pool->get(idx); } +<<<<<<< HEAD PinnedReserveSegment& get_reserve_segment() { static auto reserve_segment = [&]() { if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() > 0) { @@ -203,6 +228,8 @@ struct CUDACachingHostAllocatorImpl return reserve_segment; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TaskThreadPool* getThreadPool() { static TaskThreadPool* pool = new TaskThreadPool( static_cast(c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: @@ -217,15 +244,24 @@ struct CUDACachingHostAllocatorImpl size_t numThreads, size_t pageSize) { uintptr_t start = (uintptr_t)ptr + (size * i / numThreads); +<<<<<<< HEAD uintptr_t end = start + (size / numThreads); +======= + uintptr_t end = (uintptr_t)start + (size / numThreads); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (i == (numThreads - 1)) { end = (uintptr_t)ptr + size; } // pre-fault/map the pages by setting the first byte of the page uintptr_t alignedStart = +<<<<<<< HEAD ((start + pageSize - 1) & ~(pageSize - 1)); for (uintptr_t p = alignedStart; p < (end); p += pageSize) { +======= + (((uintptr_t)start + pageSize - 1) & ~(pageSize - 1)); + for (uintptr_t p = alignedStart; p < ((uintptr_t)end); p += pageSize) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOLINTNEXTLINE(performance-no-int-to-ptr) memset((void*)p, 0, 1); } @@ -289,7 +325,11 @@ DECLARE_HOST_ALLOCATOR( CUDACachingHostAllocator, CUDACachingHostAllocatorImpl, raw_local_deleter, +<<<<<<< HEAD caching_host_allocator) +======= + caching_host_allocator); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) REGISTER_HOST_ALLOCATOR(at::kCUDA, &caching_host_allocator) diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp index d7832c761ae55..fa404b0c7711c 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp @@ -309,8 +309,12 @@ cublasHandle_t getCurrentCUDABlasHandle() { // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup // FP32 data type calculations based on the value of the allow_tf32 flag. // To enable TF32, set the math mode of the handle to CUBLAS_TF32_TENSOR_OP_MATH. 
+<<<<<<< HEAD if (!NoTF32Guard::should_disable_tf32() && at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { +======= + if (!NoTF32Guard::should_disable_tf32() && at::globalContext().allowTF32CuBLAS()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH)); } else { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); diff --git a/aten/src/ATen/cuda/PeerToPeerAccess.cpp b/aten/src/ATen/cuda/PeerToPeerAccess.cpp index 66a75db6ea067..93ca53c77d6df 100644 --- a/aten/src/ATen/cuda/PeerToPeerAccess.cpp +++ b/aten/src/ATen/cuda/PeerToPeerAccess.cpp @@ -4,9 +4,12 @@ #include #include +<<<<<<< HEAD #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) #include #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -15,7 +18,10 @@ namespace at::cuda { static std::vector p2pAccessEnabled_; +<<<<<<< HEAD static std::vector fabricAccessEnabled_; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static int64_t num_devices_ = -1; namespace detail { @@ -33,15 +39,22 @@ void init_p2p_access_cache(int64_t num_devices) { for (const auto i : c10::irange(num_devices)) { p2pAccessEnabled_[i * num_devices + i] = 1; } +<<<<<<< HEAD fabricAccessEnabled_.clear(); fabricAccessEnabled_.resize(num_devices, -1); } } // namespace detail +======= +} + +} // namespace detail +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool get_p2p_access(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) { at::globalContext().lazyInitDevice(c10::DeviceType::CUDA); +<<<<<<< HEAD TORCH_CHECK(dev >= 0 || dev < num_devices_, dev, " is not a device"); TORCH_CHECK( dev_to_access >= 0 || dev_to_access < num_devices_, @@ -50,6 +63,15 @@ bool get_p2p_access(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) { TORCH_INTERNAL_ASSERT(num_devices_ >= 0, "p2p access cache not initialized"); auto& cache = p2pAccessEnabled_[dev * num_devices_ + dev_to_access]; +======= + TORCH_CHECK(dev >= 0 || dev < num_devices_, + dev, " is not a device"); + TORCH_CHECK(dev_to_access >= 0 || dev_to_access < num_devices_, + dev_to_access, " is not a device"); + TORCH_INTERNAL_ASSERT(num_devices_ >= 0, "p2p access cache not initialized"); + + auto &cache = p2pAccessEnabled_[dev * num_devices_ + dev_to_access]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (cache != -1) { return cache; @@ -65,6 +87,7 @@ bool get_p2p_access(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) { return cache; } +<<<<<<< HEAD namespace { #if !defined USE_ROCM && defined CUDA_VERSION && CUDA_VERSION >= 12040 && defined PYTORCH_C10_DRIVER_API_SUPPORTED @@ -180,3 +203,6 @@ bool get_fabric_access(c10::DeviceIndex dev) { } } // namespace at::cuda +======= +} // namespace at::cuda::detail +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cuda/PeerToPeerAccess.h b/aten/src/ATen/cuda/PeerToPeerAccess.h index 30d21af83ed88..c041aa7d6f107 100644 --- a/aten/src/ATen/cuda/PeerToPeerAccess.h +++ 
b/aten/src/ATen/cuda/PeerToPeerAccess.h @@ -8,6 +8,9 @@ void init_p2p_access_cache(int64_t num_devices); } TORCH_CUDA_CPP_API bool get_p2p_access(c10::DeviceIndex source_dev, c10::DeviceIndex dest_dev); +<<<<<<< HEAD TORCH_CUDA_CPP_API bool get_fabric_access(c10::DeviceIndex device); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::cuda diff --git a/aten/src/ATen/cuda/cub.cu b/aten/src/ATen/cuda/cub.cu index bc863b8880da7..3bcbe9d60dc9b 100644 --- a/aten/src/ATen/cuda/cub.cu +++ b/aten/src/ATen/cuda/cub.cu @@ -15,7 +15,12 @@ struct SumOp { template void inclusive_sum_truncating(const input_t *input, output_t *output, int64_t num_items) { +<<<<<<< HEAD inclusive_scan(input, output, NO_ROCM(::cuda)::std::plus<>{}, num_items); +======= + using NO_ROCM(at_cuda_detail)::cub::Sum; + inclusive_scan(input, output, Sum{}, num_items); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template void inclusive_sum_truncating(const int32_t *input, int32_t *output, int64_t num_items); @@ -41,7 +46,12 @@ struct CountMaskOp { void mask_exclusive_sum(const uint8_t *mask, int64_t *output_idx, int64_t n) { CountMaskOp op{}; +<<<<<<< HEAD auto iter = ATEN_CUB_TRANSFORM_ITERATOR(bool, decltype(op), decltype(mask))(mask, op); +======= + auto iter = NO_ROCM(at_cuda_detail)::cub::TransformInputIterator< + bool, decltype(op), decltype(mask)>(mask, op); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) exclusive_scan(iter, output_idx, SumOp{}, int64_t{0}, n); } diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh index 5c83810164adb..cf121a8604595 100644 --- a/aten/src/ATen/cuda/cub.cuh +++ b/aten/src/ATen/cuda/cub.cuh @@ -6,10 +6,13 @@ #include #include +<<<<<<< HEAD #ifndef USE_ROCM #include #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -55,6 +58,7 @@ #define ROCM_HIPCUB(x) x #endif +<<<<<<< HEAD #if CUB_V3_PLUS() #include #include @@ -71,6 +75,13 @@ #endif #if defined(USE_ROCM) +======= +#if (!defined(USE_ROCM) && !CUB_SUPPORTS_NV_BFLOAT16()) || defined(USE_ROCM) + +#if !defined(USE_ROCM) +namespace at_cuda_detail { +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // backport https://github.com/NVIDIA/cub/pull/306 for c10::BFloat16 @@ -92,6 +103,13 @@ template <> struct ROCM_HIPCUB(cub)::NumericTraits: ROCM_HIPCUB(cub)::BaseTraits {}; +<<<<<<< HEAD +======= +#if !defined(USE_ROCM) +} // namespace at_cuda_detail +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif #if !defined(USE_ROCM) @@ -113,7 +131,11 @@ struct cuda_type { using type = __half; }; +<<<<<<< HEAD #if !defined(USE_ROCM) +======= +#if !defined(USE_ROCM) && CUB_SUPPORTS_NV_BFLOAT16() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template<> struct cuda_type { @@ -169,6 +191,10 @@ inline void segmented_sort_pairs( } } +<<<<<<< HEAD +======= +#if CUB_SUPPORTS_UNIQUE_BY_KEY() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template 
inline void unique_by_key( KeysInputIteratorT keys_in, ValuesInputIteratorT values_in, @@ -184,6 +210,10 @@ inline void unique_by_key( CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSelect::UniqueByKey, keys_in, values_in, keys_out_, values_out, num_selected, num_input_items, c10::cuda::getCurrentCUDAStream()); } +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace impl { @@ -195,6 +225,39 @@ __global__ void transform_vals(InputIteratorT1 a, InputIteratorT2 b, OutputItera *out = scan_op(static_cast(*a), static_cast(*b)); } +<<<<<<< HEAD +======= +#if !CUB_SUPPORTS_FUTURE_VALUE() +template +struct chained_iterator { + using iterator_category = std::random_access_iterator_tag; + using difference_type = std::ptrdiff_t; + using value_type = ValueT; + using pointer = ValueT*; + using reference = ValueT&; + + InputIteratorT iter; + ValueT *first; + difference_type offset = 0; + + __device__ ValueT operator[](difference_type i) { + i += offset; + if (i == 0) { + return *first; + } else { + return ValueT(iter[i - 1]); + } + } + __device__ chained_iterator operator+(difference_type i) { + return chained_iterator{iter, first, i}; + } + __device__ ValueT operator*() { + return (*this)[0]; + } +}; +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // even though cub is supposed to support tensors with int_max elements, in reality it doesn't, // so split at int_max/2 constexpr int max_cub_size = std::numeric_limits::max() / 2 + 1; // 2**30 @@ -239,6 +302,28 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT first_elem_ptr, scan_op); C10_CUDA_KERNEL_LAUNCH_CHECK(); +<<<<<<< HEAD +======= +#if !CUB_SUPPORTS_FUTURE_VALUE() + using ArgIndexInputIterator = NO_ROCM(at_cuda_detail)::cub::ArgIndexInputIterator; + using tuple = typename ArgIndexInputIterator::value_type; + auto input_iter_transform = [=] __device__ (const tuple &x)->input_t { + if (x.key == 0) { + return *first_elem_ptr; + } else { + return x.value; + } + }; + auto input_ = NO_ROCM(at_cuda_detail)::cub::TransformInputIterator( + ArgIndexInputIterator(input + i), input_iter_transform); + CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::InclusiveScan, + input_, + output + i, + scan_op, + size_cub, + at::cuda::getCurrentCUDAStream()); +#else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::ExclusiveScan, input + i + 1, output + i, @@ -246,6 +331,10 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT ::at_cuda_detail::cub::FutureValue(first_elem_ptr), size_cub, at::cuda::getCurrentCUDAStream()); +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif } @@ -384,7 +473,11 @@ __global__ void calc_block_sums(const T * d_in, aggT * agg, int64_t nelem, int i aggT data[ITEMS_PER_THREAD]; aggT agg_val = 0; TransformFunctor transform_functor; +<<<<<<< HEAD auto iter_in = ATEN_CUB_TRANSFORM_ITERATOR(aggT, TransformFunctor, const T*)(d_in, transform_functor); +======= + auto iter_in = ROCM_HIPCUB(at_cuda_detail::cub)::TransformInputIterator, const T*>(d_in, transform_functor); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
for mixed dtypes with float/bfloat16/half (#2791)) for (int i=0; i= BLOCK_THREADS * ITEMS_PER_THREAD) { BlockLoadT(temp_storage.load).Load(iter_in, data); @@ -497,6 +590,19 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT first_elem_ptr, scan_op); C10_CUDA_KERNEL_LAUNCH_CHECK(); +<<<<<<< HEAD +======= +#if !CUB_SUPPORTS_FUTURE_VALUE() + auto input_ = impl::chained_iterator{ + input + i, first_elem_ptr}; + CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::InclusiveScan, + input_, + output + i, + scan_op, + size_cub, + at::cuda::getCurrentCUDAStream()); +#else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::ExclusiveScan, input + i, output + i, @@ -504,10 +610,18 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT ::at_cuda_detail::cub::FutureValue(first_elem_ptr), size_cub, at::cuda::getCurrentCUDAStream()); +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif } +<<<<<<< HEAD +======= +#if CUB_SUPPORTS_SCAN_BY_KEY() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template inline void inclusive_sum_by_key(KeysInputIteratorT keys, ValuesInputIteratorT input, ValuesOutputIteratorT output, int64_t num_items) { @@ -515,7 +629,11 @@ inline void inclusive_sum_by_key(KeysInputIteratorT keys, ValuesInputIteratorT i "cub InclusiveSumByKey does not support more than INT_MAX elements"); #if !defined(USE_ROCM) CUB_WRAPPER(at_cuda_detail::cub::DeviceScan::InclusiveSumByKey, +<<<<<<< HEAD keys, input, output, num_items, NO_ROCM(::cuda)::std::equal_to<>(), at::cuda::getCurrentCUDAStream()); +======= + keys, input, output, num_items, at_cuda_detail::cub::Equality(), at::cuda::getCurrentCUDAStream()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else CUB_WRAPPER(cub::DeviceScan::InclusiveSumByKey, keys, input, output, num_items, hipcub::Equality(), at::cuda::getCurrentCUDAStream()); @@ -528,13 +646,21 @@ inline void inclusive_scan_by_key(KeysInputIteratorT keys, ValuesInputIteratorT "cub InclusiveSumByKey does not support more than INT_MAX elements"); #if !defined(USE_ROCM) CUB_WRAPPER(at_cuda_detail::cub::DeviceScan::InclusiveScanByKey, +<<<<<<< HEAD keys, input, output, scan_op, num_items, NO_ROCM(::cuda)::std::equal_to<>(), at::cuda::getCurrentCUDAStream()); +======= + keys, input, output, scan_op, num_items, at_cuda_detail::cub::Equality(), at::cuda::getCurrentCUDAStream()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else CUB_WRAPPER(cub::DeviceScan::InclusiveScanByKey, keys, input, output, scan_op, num_items, hipcub::Equality(), at::cuda::getCurrentCUDAStream()); #endif } +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template void unique(InputIteratorT input, OutputIteratorT output, diff --git a/aten/src/ATen/cuda/cub.h b/aten/src/ATen/cuda/cub.h index 7430edaf8a3dc..97b468a806f8e 100644 --- a/aten/src/ATen/cuda/cub.h +++ b/aten/src/ATen/cuda/cub.h @@ -4,7 +4,11 @@ #include // NOTE: These templates are intentionally not defined in this 
header, +<<<<<<< HEAD // which avoids re-compiling them for each translation unit. If you get +======= +// which aviods re-compiling them for each translation unit. If you get +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // a link error, you need to add an explicit instantiation for your // types in cub.cu diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh index e0d7455d4c22b..1c26e01ea722d 100644 --- a/aten/src/ATen/cuda/cub_definitions.cuh +++ b/aten/src/ATen/cuda/cub_definitions.cuh @@ -10,6 +10,17 @@ #define CUB_VERSION 200001 #endif +<<<<<<< HEAD +======= +// cub sort support for __nv_bfloat16 is added to cub 1.13 in: +// https://github.com/NVIDIA/cub/pull/306 +#if CUB_VERSION >= 101300 +#define CUB_SUPPORTS_NV_BFLOAT16() true +#else +#define CUB_SUPPORTS_NV_BFLOAT16() false +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // cub support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in: // https://github.com/NVIDIA/cub/pull/326 // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake @@ -20,10 +31,35 @@ #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false #endif +<<<<<<< HEAD // There were many bc-breaking changes in major version release of CCCL v3.0.0 // Please see https://nvidia.github.io/cccl/cccl/3.0_migration_guide.html #if CUB_VERSION >= 200800 #define CUB_V3_PLUS() true #else #define CUB_V3_PLUS() false +======= +// cub support for UniqueByKey is added to cub 1.16 in: +// https://github.com/NVIDIA/cub/pull/405 +#if CUB_VERSION >= 101600 +#define CUB_SUPPORTS_UNIQUE_BY_KEY() true +#else +#define CUB_SUPPORTS_UNIQUE_BY_KEY() false +#endif + +// cub support for scan by key is added to cub 1.15 +// in https://github.com/NVIDIA/cub/pull/376 +#if CUB_VERSION >= 101500 +#define CUB_SUPPORTS_SCAN_BY_KEY() 1 +#else +#define CUB_SUPPORTS_SCAN_BY_KEY() 0 +#endif + +// cub support for cub::FutureValue is added to cub 1.15 in: +// https://github.com/NVIDIA/cub/pull/305 +#if CUB_VERSION >= 101500 +#define CUB_SUPPORTS_FUTURE_VALUE() true +#else +#define CUB_SUPPORTS_FUTURE_VALUE() false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index b7f80101d926e..5cb3f7212f613 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -19,6 +19,13 @@ #include #include +<<<<<<< HEAD +======= +#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) +#include +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if AT_CUDNN_ENABLED() #include #endif @@ -89,6 +96,32 @@ void CUDAHooks::init() const { // have a chance to enable vitals. at::vitals::VitalsAPI.setVital("CUDA", "used", "true", /* force = */ true); +<<<<<<< HEAD +======= + // Sets the CUDA_MODULE_LOADING environment variable + // if it's not set by the user. + // CUDA_MODULE_LOADING="LAZY" is default for all drivers released for CUDA 12.2+. + // Check the driver version and only set the env variable if needed. 
+ bool set_lazy_module_loading = true; + #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) + auto driver_api = c10::cuda::DriverAPI::get(); + // Initialize NVML + if (driver_api->nvmlInit_v2_() == NVML_SUCCESS) { + // Get the driver version + int version = -1; + auto res = driver_api->nvmlSystemGetCudaDriverVersion_v2_(&version); + if (res == NVML_SUCCESS) { + // Check if driver is sufficiently new + if (version >= 12020) { + set_lazy_module_loading = false; + } + } + } + #endif + if (set_lazy_module_loading) { + c10::utils::set_env("CUDA_MODULE_LOADING", "LAZY", false); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto num_devices = c10::cuda::device_count_ensure_non_zero(); c10::cuda::CUDACachingAllocator::init(num_devices); at::cuda::detail::init_p2p_access_cache(num_devices); @@ -180,6 +213,7 @@ bool CUDAHooks::hasCuBLASLt() const { #endif } +<<<<<<< HEAD bool CUDAHooks::hasCKSDPA() const { #if !defined(USE_ROCM) @@ -201,6 +235,8 @@ bool CUDAHooks::hasCKGEMM() const { #endif } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool CUDAHooks::hasROCM() const { // Currently, this is same as `compiledWithMIOpen`. // But in future if there are ROCm builds without MIOpen, @@ -281,9 +317,12 @@ bool CUDAHooks::compiledWithMIOpen() const { bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { #if AT_CUDNN_ENABLED() +<<<<<<< HEAD if (!hasCUDA()) { return false; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOTE: extra parenthesis around numbers disable clang warnings about // dead code return true; @@ -294,9 +333,12 @@ bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const { #if AT_CUDNN_ENABLED() +<<<<<<< HEAD if (!hasCUDA()) { return false; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); // Check for Volta cores if (prop->major >= 7) { @@ -311,6 +353,7 @@ bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const { bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const { #if AT_CUDNN_ENABLED() +<<<<<<< HEAD if (!hasCUDA()) { return false; } @@ -331,6 +374,8 @@ bool CUDAHooks::supportsBFloat16RNNWithCuDNN() const { if (!hasCUDA()) { return false; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); // Check for Volta cores if (prop->major >= 8) { diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index 8d3d1db003928..a2940b298e09c 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -17,7 +17,11 @@ TORCH_CUDA_CPP_API void set_magma_init_fn(void (*magma_init_fn)()); // The real implementation of CUDAHooksInterface struct CUDAHooks : public at::CUDAHooksInterface { +<<<<<<< HEAD CUDAHooks(at::CUDAHooksArgs /*unused*/) {} +======= + CUDAHooks(at::CUDAHooksArgs) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void init() const override; Device getDeviceFromPtr(void* 
data) const override; bool isPinnedPtr(const void* data) const override; @@ -31,8 +35,11 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool hasCuSOLVER() const override; bool hasCuBLASLt() const override; bool hasROCM() const override; +<<<<<<< HEAD bool hasCKSDPA() const override; bool hasCKGEMM() const override; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const at::cuda::NVRTC& nvrtc() const override; DeviceIndex current_device() const override; bool isBuilt() const override {return true;} @@ -45,7 +52,10 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool supportsDilatedConvolutionWithCuDNN() const override; bool supportsDepthwiseConvolutionWithCuDNN() const override; bool supportsBFloat16ConvolutionWithCuDNNv8() const override; +<<<<<<< HEAD bool supportsBFloat16RNNWithCuDNN() const override; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool hasCUDART() const override; long versionCUDART() const override; long versionCuDNN() const override; diff --git a/aten/src/ATen/cuda/detail/DeviceThreadHandles.h b/aten/src/ATen/cuda/detail/DeviceThreadHandles.h index 71a344d281d2a..0a05477e08555 100644 --- a/aten/src/ATen/cuda/detail/DeviceThreadHandles.h +++ b/aten/src/ATen/cuda/detail/DeviceThreadHandles.h @@ -122,7 +122,11 @@ struct DeviceThreadHandlePool : public std::enable_shared_from_this 0) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto parent = weak_parent.lock(); if (!parent) { // If this thread exits after atexit handlers have completed, the diff --git a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh index 487e798bd80f6..5a050d1937e87 100644 --- a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh +++ b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh @@ -49,12 +49,20 @@ struct OffsetCalculator { #if defined(USE_ROCM) if ((dims > 0) && (dims <= 2)) { auto divmod = sizes_[0].divmod(linear_idx); +<<<<<<< HEAD #pragma unroll +======= + #pragma unroll +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (int arg = 0; arg < NARGS; arg++) offsets[arg] = divmod.mod * strides_[0][arg]; if (dims >= 2) { divmod = sizes_[1].divmod(divmod.div); +<<<<<<< HEAD #pragma unroll +======= + #pragma unroll +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (int arg = 0; arg < NARGS; arg++) offsets[arg] += divmod.mod * strides_[1][arg]; } diff --git a/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh b/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh index 7de0321256fd7..5a63e455db600 100644 --- a/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh +++ b/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh @@ -19,7 +19,11 @@ struct PhiloxCudaState { // Called if graph capture is underway PhiloxCudaState(int64_t* seed, int64_t* offset_extragraph, +<<<<<<< HEAD uint64_t offset_intragraph) { +======= + uint32_t offset_intragraph) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) seed_.ptr = seed; offset_.ptr = offset_extragraph; offset_intragraph_ = offset_intragraph; @@ -36,7 +40,11 @@ struct PhiloxCudaState { Payload seed_{}; Payload offset_{}; +<<<<<<< HEAD 
uint64_t offset_intragraph_ = 0; +======= + uint32_t offset_intragraph_ = 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool captured_ = false; }; diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h index aca83386ad421..cbf4d811ebf14 100644 --- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h @@ -117,8 +117,11 @@ namespace at::cuda { _(nvrtcGetPTXSize) \ _(nvrtcGetPTX) \ _(cuModuleLoadData) \ +<<<<<<< HEAD _(cuModuleLoad) \ _(cuGetErrorString) \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _(cuModuleGetFunction) \ _(HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR) \ _(nvrtcGetErrorString) \ diff --git a/aten/src/ATen/cuda/tunable/GemmCommon.h b/aten/src/ATen/cuda/tunable/GemmCommon.h index 5d9e33b2b5b2f..4ff019366af75 100644 --- a/aten/src/ATen/cuda/tunable/GemmCommon.h +++ b/aten/src/ATen/cuda/tunable/GemmCommon.h @@ -13,7 +13,10 @@ #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -30,8 +33,11 @@ namespace at::cuda::tunable { +<<<<<<< HEAD using at::blas::ScalingType; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) enum class BlasOp { N = 0, T = 1 @@ -151,7 +157,10 @@ inline std::string ScalarTypeToBLASType(c10::ScalarType scalar_type) { BLASType = "unknown"; } return BLASType; +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Similar to Compute Type in GemmRocblas.h @@ -164,7 +173,11 @@ inline std::string ComputeTypeFor() { // ROCBLAS and hipBLASLt. 
template <> inline std::string ComputeTypeFor() { +<<<<<<< HEAD if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) != at::Float32Precision::TF32) { +======= + if (!at::globalContext().allowTF32CuBLAS()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return "f32_r"; } else { return "xf32_r"; @@ -246,6 +259,7 @@ inline std::string to_string_epilogue(const at::cuda::blas::GEMMAndBiasActivatio namespace detail { +<<<<<<< HEAD static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t size, const NumericalCheckConfig& config) { if (!config.enabled) { @@ -253,10 +267,16 @@ static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t siz } auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA); +======= +static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t size) { + auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA); + // comparison done as 1D tensor +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::Tensor ref = at::from_blob(c, {size}, options); at::Tensor oth = at::from_blob(other_c, {size}, options); at::Tensor ref_float = ref.to(at::kFloat); at::Tensor oth_float = oth.to(at::kFloat); +<<<<<<< HEAD const bool ok = at::allclose(ref_float, oth_float, config.rtol, config.atol); if (ok) { @@ -265,6 +285,28 @@ static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t siz TUNABLE_LOG3("├──verify numerics: FAILED with atol=", config.atol, ", rtol=", config.rtol); } return ok; +======= + std::vector atols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5}; + std::vector rtols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5}; + double last_succeed_atol = 1; + double last_succeed_rtol = 1; + for (auto& atol : atols) { + for (auto& rtol : rtols) { + if (at::allclose(ref_float, oth_float, rtol, atol)) { + last_succeed_atol = atol; + last_succeed_rtol = rtol; + } + } + } + if (last_succeed_atol == 1) { + return false; + } + else { + TUNABLE_LOG3("├──verify numerics: atol=", last_succeed_atol, ", rtol=", last_succeed_rtol); + } + + return true; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } @@ -349,10 +391,15 @@ struct GemmParams : OpParams { } TuningStatus NumericalCheck(GemmParams *other) { +<<<<<<< HEAD auto* ctx = getTuningContext(); auto cfg = ctx->GetNumericalCheckConfig(); auto c_dtype = c10::CppTypeToScalarType::value; return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; +======= + auto c_dtype = c10::CppTypeToScalarType::value; + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } char transa{}; @@ -445,10 +492,15 @@ struct GemmAndBiasParams : OpParams { } TuningStatus NumericalCheck(GemmAndBiasParams *other) { +<<<<<<< HEAD auto* ctx = getTuningContext(); auto cfg = ctx->GetNumericalCheckConfig(); auto c_dtype = c10::CppTypeToScalarType::value; return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; +======= + auto c_dtype = c10::CppTypeToScalarType::value; + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? 
OK : FAIL; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } char transa{}; @@ -544,10 +596,15 @@ struct GemmStridedBatchedParams : OpParams { } TuningStatus NumericalCheck(GemmStridedBatchedParams *other) { +<<<<<<< HEAD auto* ctx = getTuningContext(); auto cfg = ctx->GetNumericalCheckConfig(); auto c_dtype = c10::CppTypeToScalarType::value; return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; +======= + auto c_dtype = c10::CppTypeToScalarType::value; + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } char transa{}; @@ -600,8 +657,12 @@ struct ScaledGemmParams : OpParams { // // In TunableOp, we must distinguish in param signature these two cases: with and without a bias vector. return fmt::sprintf("%c%c_%ld_%ld_%ld_ld_%ld_%ld_%ld_rw_%d_bias_%s", +<<<<<<< HEAD transa, transb, m, n, k, lda, ldb, ldc, a_scaling_type == ScalingType::RowWise && b_scaling_type == ScalingType::RowWise, +======= + transa, transb, m, n, k, lda, ldb, ldc, use_rowwise, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bias_ptr == nullptr ? "None" : at::toString(bias_dtype)); } @@ -663,9 +724,13 @@ struct ScaledGemmParams : OpParams { } TuningStatus NumericalCheck(ScaledGemmParams *other) { +<<<<<<< HEAD auto* ctx = getTuningContext(); auto cfg = ctx->GetNumericalCheckConfig(); return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; +======= + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? 
OK : FAIL; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } char transa{}; @@ -678,13 +743,19 @@ struct ScaledGemmParams : OpParams { int64_t lda{}; ScalarType a_dtype{}; ScalarType a_scale_dtype{}; +<<<<<<< HEAD ScalingType a_scaling_type{}; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const void* b{}; const void* b_scale_ptr{}; int64_t ldb{}; ScalarType b_dtype{}; ScalarType b_scale_dtype{}; +<<<<<<< HEAD ScalingType b_scaling_type{}; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const void* bias_ptr{}; ScalarType bias_dtype{}; void* c{}; @@ -693,6 +764,10 @@ struct ScaledGemmParams : OpParams { ScalarType c_dtype{}; void* amax_ptr{}; bool use_fast_accum{}; +<<<<<<< HEAD +======= + bool use_rowwise{}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) private: bool duplicate_inputs_{false}; }; diff --git a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h index 10490b0323ed9..bd80943804c47 100644 --- a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h +++ b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h @@ -14,7 +14,10 @@ #include #include +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define TORCH_HIPBLASLT_CHECK(EXPR) \ do { \ hipblasStatus_t __err = EXPR; \ @@ -216,6 +219,7 @@ float GetBetaFromParams(const ScaledGemmParams* params) { } template +<<<<<<< HEAD ScalingType GetAScalingTypeFromParams(const GemmParams* params) { return ScalingType::TensorWise; } @@ -253,6 +257,25 @@ ScalingType GetAScalingTypeFromParams(const ScaledGemmParams* params) { template ScalingType GetBScalingTypeFromParams(const ScaledGemmParams* params) { return params->b_scaling_type; +======= +bool GetUseRowwiseFromParams(const GemmParams* params) { + return false; +} + +template +bool GetUseRowwiseFromParams(const GemmAndBiasParams* params) { + return false; +} + +template +bool GetUseRowwiseFromParams(const GemmStridedBatchedParams* params) { + return false; +} + +template +bool GetUseRowwiseFromParams(const ScaledGemmParams* params) { + return params->use_rowwise; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template @@ -507,7 +530,11 @@ class HipblasltGemmOp : public Callable { } hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F; +<<<<<<< HEAD if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { +======= + if (at::globalContext().allowTF32CuBLAS()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) computeType = HIPBLAS_COMPUTE_32F_FAST_TF32; } HipBlasLtMatmulDescriptor matmul(computeType, HIP_R_32F); @@ -519,6 +546,7 @@ class HipblasltGemmOp : public Callable { const void* mat2_scale_ptr = GetBScalePointerFromParams(params); const void* result_scale_ptr = GetDScalePointerFromParams(params); if (mat1_scale_ptr && mat2_scale_ptr) { +<<<<<<< HEAD hipblasLtMatmulDescAttributes_t a_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER; hipblasLtMatmulDescAttributes_t b_scale_ptr_desc = 
HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER; if (GetAScalingTypeFromParams(params) == ScalingType::RowWise) { @@ -537,6 +565,25 @@ class HipblasltGemmOp : public Callable { } matmul.setAttribute(a_scale_ptr_desc, mat1_scale_ptr); matmul.setAttribute(b_scale_ptr_desc, mat2_scale_ptr); +======= +#ifdef HIPBLASLT_VEC_EXT + if (GetUseRowwiseFromParams(params)) { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT, mat1_scale_ptr); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT, mat2_scale_ptr); + } + else +#endif + { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr); + } +#ifdef HIPBLASLT_OUTER_VEC + if (GetUseRowwiseFromParams(params)) { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_MODE, HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_MODE, HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F); + } +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (result_scale_ptr) { matmul.setAttribute(HIPBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr); diff --git a/aten/src/ATen/cuda/tunable/GemmRocblas.h b/aten/src/ATen/cuda/tunable/GemmRocblas.h index 60eaa2e4d4754..234456bd937e5 100644 --- a/aten/src/ATen/cuda/tunable/GemmRocblas.h +++ b/aten/src/ATen/cuda/tunable/GemmRocblas.h @@ -141,7 +141,11 @@ class RocblasGemmOp : public Callable> { TuningStatus Call(const GemmParams* params) override { auto input_output_type = RocBlasDataTypeFor(); +<<<<<<< HEAD if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r) +======= + if (at::globalContext().allowTF32CuBLAS() && input_output_type == rocblas_datatype_f32_r) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return FAIL; // no support for TF32 in rocBLAS auto compute_type = RocBlasComputeTypeFor(); auto h_a = DoCastForHalfOrBfloat16(params->alpha); @@ -209,7 +213,11 @@ class RocblasGemmStridedBatchedOp : public Callable> TuningStatus Call(const GemmStridedBatchedParams* params) override { auto input_output_type = RocBlasDataTypeFor(); +<<<<<<< HEAD if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r) +======= + if (at::globalContext().allowTF32CuBLAS() && input_output_type == rocblas_datatype_f32_r) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return FAIL; // no support for TF32 in rocBLAS auto compute_type = RocBlasComputeTypeFor(); auto h_a = DoCastForHalfOrBfloat16(params->alpha); diff --git a/aten/src/ATen/cuda/tunable/README.md b/aten/src/ATen/cuda/tunable/README.md index 1a9c91dab7c0b..eca37cee98b5f 100644 --- a/aten/src/ATen/cuda/tunable/README.md +++ b/aten/src/ATen/cuda/tunable/README.md @@ -38,7 +38,11 @@ GemmTunableOp_float_NT,nt_25088_4096_64,1219,1.262 GemmTunableOp_float_NT,nt_4096_4096_64,1216,0.033 ``` +<<<<<<< HEAD Note the "Validator" lines. If you change a library version, or ROCm version, or PyTorch version, TunableOp will detect +======= +Note the "Validator" lines. 
If you change a library verison, or ROCm version, or PyTorch version, TunableOp will detect +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) this and reject the tunings file because the prior tunings are likely affected by other software changes. The remaining lines are the tuned solutions for each TunableOp encountered during your execution. Each line consists of @@ -145,7 +149,11 @@ programmatically since the settings become fixed. Use the C++ or Python APIs ins | PYTORCH_TUNABLEOP_VERBOSE | Default is 0. Set to 1 to enable basic logging. 2 for basic tuning status. 3 for full trace. | | PYTORCH_TUNABLEOP_VERBOSE_FILENAME | Default is "err" for stderr. Set to "out" for stdout or a filename for capturing verbose logging. | | PYTORCH_TUNABLEOP_FILENAME | Default is 'tunableop_results.csv'. | +<<<<<<< HEAD | PYTORCH_TUNABLEOP_NUMERICAL_CHECK | Default is off. Set 'atol_rtol' to enable, for example "1e-5_1e-5". | +======= +| PYTORCH_TUNABLEOP_NUMERICAL_CHECK | Default is 0. Set to 1 to enable. | +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) | PYTORCH_TUNABLEOP_ROCBLAS_ENABLED | Default is 1. Set to 0 to disable rocblas being considered during tuning. | | PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED | Default is 1. Set to 0 to disable hipblaslt being considered during tuning. | | PYTORCH_TUNABLEOP_MAX_TUNING_DURATION_MS | Default is 30. Unit is milliseconds. | @@ -154,7 +162,11 @@ programmatically since the settings become fixed. Use the C++ or Python APIs ins | PYTORCH_TUNABLEOP_MAX_WARMUP_ITERATIONS | Default is 0, meaning it is not used. | | PYTORCH_TUNABLEOP_ICACHE_FLUSH_ENABLED | Default is 1. Set to 0 to disable. | | PYTORCH_TUNABLEOP_ROTATING_BUFFER_SIZE | Default (or < 0) is to query L2 cache size. Set to 0 to disable. Otherwise, set to the number of MiB to use for the pool of operator parameters. For example, setting this to the size of your device's memory cache will guarantee that every tuning iteration will use a cold cache. | +<<<<<<< HEAD | PYTORCH_TUNABLEOP_BLAS_LOG | Default is 0. Set to 1 to enable. Write BLAS parameters to tuning CSV file. | +======= +| PYTORCH_TUNABLEOP_BLAS_LOG | Default is 0. Set to 1 to enable. Write BLAS paramters to tuning CSV file. | +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ### Python Interface All python APIs exist in the `torch.cuda.tunable` module. @@ -173,9 +185,16 @@ All python APIs exist in the `torch.cuda.tunable` module. | get_max_tuning_iterations() -> int | | | set_filename(filename: str, insert_device_ordinal: bool = False) -> None | | | get_filename() -> str | | +<<<<<<< HEAD | set_numerical_check_tolerances(enable: bool, atol: float, rtol: float) -> None | Enable or disable numerical checking; atol and rtol default to 1e-5. | get_results() -> Tuple[str, str, str, float] | | | get_validators() -> Tuple[str, str] | | +======= +| get_results() -> Tuple[str, str, str, float] | | +| get_validators() -> Tuple[str, str] | | +| write_file_on_exit(val: bool) -> None | Default is True. | +| write_file(filename: Optional[str] = None) -> None | If filename not given, it will call get_filename(). 
| +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) | read_file(filename: Optional[str] = None) -> None | If filename not given, it will call get_filename(). | | tune_gemm_in_file(filename: str) -> None | read an untuned file and tune GEMMs in it. | | mgpu_tune_gemm_in_file(filename_pattern: str, num_gpus: int) -> None: -> None | read one or more untuned files and tune all unique GEMMs on one or more GPUs. | diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp index 9fb04b40d30f6..5096059ba48e1 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.cpp +++ b/aten/src/ATen/cuda/tunable/Tunable.cpp @@ -107,6 +107,7 @@ void TuningResultsManager::AddImpl(const std::string& op_signature, } void TuningResultsManager::Add(const std::string& op_signature, const std::string& params_signature, ResultEntry best) { +<<<<<<< HEAD bool is_new = false; ResultEntry inserted = ResultEntry::Null(); @@ -131,6 +132,16 @@ void TuningResultsManager::Add(const std::string& op_signature, const std::strin } } +======= + std::scoped_lock l{lock_}; + + auto it = results_.find(op_signature); + if (it == results_.end()) { + it = results_.insert({op_signature, {}}).first; + } + + AddImpl(op_signature, params_signature, std::move(best), it->second); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std::string& op_signature, @@ -166,6 +177,7 @@ void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std } } +<<<<<<< HEAD void TuningResultsManager::InitRealtimeAppend(const std::string& filename, const std::unordered_map& validators) { std::scoped_lock fl{realtime_file_mutex_}; @@ -237,6 +249,8 @@ void TuningResultsManager::CloseRealtimeAppend() { } } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void TuningResultsManager::Delete(const std::string& op_signature, const std::string& params_signature) { std::scoped_lock l{lock_}; @@ -307,6 +321,7 @@ TuningResultsValidator::TuningResultsValidator() { []() { return GetPyTorchVersion(); }, [this](auto&& k) { return ValidatePyTorchVersion(std::forward(k)); }); #ifdef USE_ROCM +<<<<<<< HEAD // hip { // HIP version is more accurate than ROCm version. User's environment could be a stock @@ -318,6 +333,21 @@ TuningResultsValidator::TuningResultsValidator() { [hip_version](auto&& k) { TUNABLE_LOG1("HIP_VERSION validation: expect ", k, " to match ", hip_version); return hip_version == k ? OK : FAIL; +======= + // rocm + { +#ifdef _WIN32 + std::string rocm_version = HIP_VERSION_BUILD_NAME; +#else + std::string rocm_version = ROCM_BUILD_INFO; +#endif + RegisterValidator( + "ROCM_VERSION", + [rocm_version]() { return rocm_version; }, + [rocm_version](auto&& k) { + TUNABLE_LOG1("ROCM_VERSION validation: expect ", k, " to match ", rocm_version); + return rocm_version == k ? 
OK : FAIL; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); } // gfx arch @@ -483,6 +513,10 @@ TuningContext::TuningContext() : tuning_enable_{true}, record_untuned_enable_{false}, manager_initialized_{false}, +<<<<<<< HEAD +======= + write_file_on_exit_{true}, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) numerics_check_enable_{false}, max_tuning_duration_ms_{30}, max_tuning_iterations_{100}, @@ -490,6 +524,11 @@ TuningContext::TuningContext() : max_warmup_iterations_{0}, icache_flush_{true}, rotating_buffer_size_{-1}, +<<<<<<< HEAD +======= + filename_{}, + untuned_file_{}, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) results_count_from_input_file_{0}, is_shutting_down_{false} { @@ -503,8 +542,25 @@ TuningContext::~TuningContext() { // but doesn't do any computation itself. return; } +<<<<<<< HEAD TUNABLE_LOG1("Closing File"); GetTuningResultsManager().CloseRealtimeAppend(); // Since, we do instant logging by default now. +======= + auto filename = GetFilename(); + if (IsTunableOpEnabled() && IsTuningEnabled() && !filename.empty() && write_file_on_exit_) { + if (results_count_from_input_file_ < GetTuningResultsManager().GetSize()) { + if (results_count_from_input_file_ > 0) { + TUNABLE_LOG1("additional tuning results available, rewriting file ", filename); + } + else { + TUNABLE_LOG1("writing file ", filename); + } + if (!WriteFile(filename)) { + TUNABLE_LOG1("failed to write file ", filename); + } + } + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (untuned_file_.good()) { untuned_file_.close(); @@ -580,16 +636,27 @@ std::ofstream& TuningContext::GetUntunedFile(){ filename.append(device); } +<<<<<<< HEAD untuned_file_ = std::ofstream(filename, std::ios::out | std::ios::app); +======= + untuned_file_ = std::ofstream(filename, std::ios::out | std::ios::trunc); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return untuned_file_; } +<<<<<<< HEAD +======= +void TuningContext::WriteFileOnExit(bool value) { + write_file_on_exit_ = value; +} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void TuningContext::EnableNumericsCheck(bool value) { numerics_check_enable_ = value; } +<<<<<<< HEAD NumericalCheckConfig TuningContext::GetNumericalCheckConfig() const { const auto env_opt = c10::utils::get_env("PYTORCH_TUNABLEOP_NUMERICAL_CHECK"); @@ -633,6 +700,14 @@ void TuningContext::SetNumericalCheckConfig(bool enabled, double atol, double rt bool TuningContext::IsNumericsCheckEnabled() const { const auto cfg = GetNumericalCheckConfig(); return cfg.enabled || numerics_check_enable_; +======= +bool TuningContext::IsNumericsCheckEnabled() const { + const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_NUMERICAL_CHECK"); + if (env == "1") { + return true; + } + return numerics_check_enable_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void TuningContext::SetMaxTuningDurationMs(int max_duration_ms) { @@ -742,6 +817,14 @@ TuningResultsManager& TuningContext::GetTuningResultsManager() { auto filename = GetFilename(); if (!filename.empty() 
&& !IsRecordUntunedEnabled()) { ReadFile(filename); +<<<<<<< HEAD +======= + // attempt immediately to open file for writing to catch errors early + std::ofstream file(filename, std::ios::out | std::ios::app); + if (!file.good()) { + TORCH_WARN("failed to open file '", filename, "' for writing; your tuning results will not be saved"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } }); return manager_; @@ -847,6 +930,30 @@ bool TuningContext::ReadFile(const std::string& filename_) { return true; } +<<<<<<< HEAD +======= +bool TuningContext::WriteFile(const std::string& filename_) { + std::string filename = filename_.empty() ? GetFilename() : filename_; + std::ofstream file(filename, std::ios::out | std::ios::trunc); + if (!file.good()) { + TUNABLE_LOG1("error opening tuning results file for writing ", filename); + return false; + } + auto validators = GetTuningResultsValidator().GetAllValidators(); + for (const auto& [key, val] : validators) { + file << "Validator," << key << "," << val << std::endl; + } + auto results = GetTuningResultsManager().Dump(); + for (const auto& [op_sig, kernelmap] : results) { + for (const auto& [param_sig, result] : kernelmap) { + file << op_sig << "," << param_sig << "," << result << std::endl; + } + } + file.close(); + return true; +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace { struct MaybeDelete { diff --git a/aten/src/ATen/cuda/tunable/Tunable.h b/aten/src/ATen/cuda/tunable/Tunable.h index 17b4ea34ddf61..999864c8731b8 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.h +++ b/aten/src/ATen/cuda/tunable/Tunable.h @@ -103,6 +103,7 @@ class TORCH_CUDA_CPP_API TuningResultsManager { void RecordUntuned( std::ofstream& untuned_file, const std::string& op_signature, const std::string& params_signature, const std::string& blas_signature); +<<<<<<< HEAD void InitRealtimeAppend( const std::string& filename, @@ -121,6 +122,12 @@ class TORCH_CUDA_CPP_API TuningResultsManager { ResultsMap results_; UntunedMap untuned_results_; bool validators_written_ = false; +======= + private: + std::mutex lock_; + ResultsMap results_; + UntunedMap untuned_results_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; @@ -148,6 +155,7 @@ class TORCH_CUDA_CPP_API TuningResultsValidator { GetValidateFuncs validators_; }; +<<<<<<< HEAD struct NumericalCheckConfig { bool enabled{false}; double atol{1e-5}; @@ -158,6 +166,8 @@ struct NumericalCheckConfig { }; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class TORCH_CUDA_CPP_API TuningContext { public: TuningContext(); @@ -179,8 +189,11 @@ class TORCH_CUDA_CPP_API TuningContext { void EnableNumericsCheck(bool value); bool IsNumericsCheckEnabled() const; +<<<<<<< HEAD void SetNumericalCheckConfig(bool enabled, double atol, double rtol); NumericalCheckConfig GetNumericalCheckConfig() const; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void SetMaxTuningDurationMs(int max_duration_ms); int GetMaxTuningDurationMs() const; @@ -211,7 +224,14 @@ class TORCH_CUDA_CPP_API TuningContext { void SetFilename(const std::string& filename, bool insert_device_ordinal=false); std::string GetFilename() const; +<<<<<<< HEAD + bool 
ReadFile(const std::string& filename={}); +======= + void WriteFileOnExit(bool value); + bool ReadFile(const std::string& filename={}); + bool WriteFile(const std::string& filename={}); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template void Log(int level, Types... args) { @@ -230,6 +250,10 @@ class TORCH_CUDA_CPP_API TuningContext { bool tuning_enable_; bool record_untuned_enable_; bool manager_initialized_; +<<<<<<< HEAD +======= + bool write_file_on_exit_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool numerics_check_enable_; int max_tuning_duration_ms_; int max_tuning_iterations_; @@ -244,8 +268,11 @@ class TORCH_CUDA_CPP_API TuningContext { std::ofstream untuned_file_; size_t results_count_from_input_file_; bool is_shutting_down_; +<<<<<<< HEAD NumericalCheckConfig numerics_cfg_{}; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; TORCH_CUDA_CPP_API TuningContext* getTuningContext(); diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h index c014d1ea569c8..801d48a2e8045 100644 --- a/aten/src/ATen/cuda/tunable/TunableGemm.h +++ b/aten/src/ATen/cuda/tunable/TunableGemm.h @@ -96,13 +96,19 @@ class DefaultScaledGemmOp : public Callable> { params->lda, params->a_dtype, params->a_scale_dtype, +<<<<<<< HEAD params->a_scaling_type, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) params->b, params->b_scale_ptr, params->ldb, params->b_dtype, params->b_scale_dtype, +<<<<<<< HEAD params->b_scaling_type, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) params->bias_ptr, params->bias_dtype, params->c, @@ -110,7 +116,11 @@ class DefaultScaledGemmOp : public Callable> { params->ldc, params->c_dtype, params->use_fast_accum, +<<<<<<< HEAD std::nullopt /* alpha */); +======= + params->use_rowwise); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return OK; } }; diff --git a/aten/src/ATen/cuda/tunable/TunableOp.h b/aten/src/ATen/cuda/tunable/TunableOp.h index 830473cb4ca9e..7620754fba887 100644 --- a/aten/src/ATen/cuda/tunable/TunableOp.h +++ b/aten/src/ATen/cuda/tunable/TunableOp.h @@ -29,7 +29,11 @@ template class Callable { public: virtual ~Callable() = default; +<<<<<<< HEAD virtual TuningStatus Call(const ParamsT* /*unused*/) { +======= + virtual TuningStatus Call(const ParamsT*) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return FAIL; } virtual TuningStatus IsSupported(const ParamsT* params) { @@ -235,7 +239,11 @@ class TunableOp { // numeric check option is controlled by non-static env var, so check it once per tuned operator bool do_numerics_check = ctx->IsNumericsCheckEnabled(); +<<<<<<< HEAD // calculate a reference answer for numerical check +======= + // calcaulte a reference answer for numerical check +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (do_numerics_check) { reference_params = params->DeepCopy(false); TORCH_CHECK(ops_[ResultEntry::Default()]->Call(reference_params) == 
OK); @@ -267,10 +275,34 @@ class TunableOp { for (size_t i = 0; i < op_names_.size(); i++) { auto* candidate = ops_[op_names_[i]].get(); // borrow pointer +<<<<<<< HEAD auto status = candidate->Call(reusable_params[0]); if (status != OK) { TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); continue; +======= + if (do_numerics_check) { + ParamsT* numerical_params = params->DeepCopy(false); + auto status = candidate->Call(numerical_params); + if (status != OK) { + numerical_params->Delete(); + TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + status = reference_params->NumericalCheck(numerical_params); + numerical_params->Delete(); + if (status != OK) { + TUNABLE_LOG3("├──numerics check failed for id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + } + else { + auto status = candidate->Call(reusable_params[0]); + if (status != OK) { + TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // collect a small profile @@ -293,6 +325,7 @@ class TunableOp { continue; } +<<<<<<< HEAD if (do_numerics_check) { ParamsT* numerical_params = params->DeepCopy(false); auto status = candidate->Call(numerical_params); @@ -309,6 +342,8 @@ class TunableOp { } } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // for warmup does user set max duration, max iters, or both? // warmup is skipped by default, i.e. warmup_iter = 0 // warmup will be set to the non-zero value of max_warmup_duration diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp index dbd178e0f8eee..b9f1e692e0f72 100644 --- a/aten/src/ATen/cudnn/Descriptors.cpp +++ b/aten/src/ATen/cudnn/Descriptors.cpp @@ -141,7 +141,11 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo size[i] = (int) t.size(i); } for (const auto i : c10::irange(dim, pad)) { +<<<<<<< HEAD size[i] = 1; +======= + size[i] = (int) 1; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } dim = std::max(dim, pad); cudnnTensorFormat_t filter_format{}; diff --git a/aten/src/ATen/cudnn/Types.cpp b/aten/src/ATen/cudnn/Types.cpp index f612436f56724..fe7b8a17342bd 100644 --- a/aten/src/ATen/cudnn/Types.cpp +++ b/aten/src/ATen/cudnn/Types.cpp @@ -2,8 +2,11 @@ #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::native { cudnnDataType_t getCudnnDataTypeFromScalarType(const at::ScalarType dtype) { @@ -22,10 +25,16 @@ cudnnDataType_t getCudnnDataTypeFromScalarType(const at::ScalarType dtype) { } else if (dtype == at::kByte) { return CUDNN_DATA_UINT8; } +<<<<<<< HEAD TORCH_CHECK(false, "getCudnnDataTypeFromScalarType() not supported for ", toString(dtype) ); +======= + std::string msg("getCudnnDataTypeFromScalarType() not supported for "); + msg += toString(dtype); + throw std::runtime_error(msg); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } cudnnDataType_t getCudnnDataType(const at::Tensor& tensor) { diff --git a/aten/src/ATen/detail/AcceleratorHooksInterface.h 
b/aten/src/ATen/detail/AcceleratorHooksInterface.h index fb9e51ded83e3..09494977ca578 100644 --- a/aten/src/ATen/detail/AcceleratorHooksInterface.h +++ b/aten/src/ATen/detail/AcceleratorHooksInterface.h @@ -12,7 +12,11 @@ namespace at { // AcceleratorHooksInterface is a shared interface provided by all // accelerators to allow generic code. +<<<<<<< HEAD // This interface is hook-based as it corresponds to all the functions +======= +// This inferface is hook-based as it corresponds to all the functions +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // that are going to be called in a generic way from the CPU code. struct TORCH_API AcceleratorHooksInterface { diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index f1f2056917472..6179d66909a2f 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -118,6 +118,7 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface { return false; } +<<<<<<< HEAD virtual bool hasCKSDPA() const { return false; } @@ -126,6 +127,8 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface { return false; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) virtual const at::cuda::NVRTC& nvrtc() const { TORCH_CHECK(false, "NVRTC requires CUDA. ", CUDA_HELP); } @@ -166,10 +169,13 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface { return false; } +<<<<<<< HEAD virtual bool supportsBFloat16RNNWithCuDNN() const { return false; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) virtual long versionCuDNN() const { TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. 
", CUDA_HELP); } diff --git a/aten/src/ATen/detail/HPUHooksInterface.h b/aten/src/ATen/detail/HPUHooksInterface.h index 3240ff4dac137..3649f4e39ea4a 100644 --- a/aten/src/ATen/detail/HPUHooksInterface.h +++ b/aten/src/ATen/detail/HPUHooksInterface.h @@ -25,7 +25,11 @@ struct TORCH_API HPUHooksInterface : AcceleratorHooksInterface { false, "Cannot get device of pointer on HPU without HPU backend"); } +<<<<<<< HEAD bool isPinnedPtr(const void* /*data*/) const override { +======= + bool isPinnedPtr(const void*) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return false; } diff --git a/aten/src/ATen/detail/MTIAHooksInterface.cpp b/aten/src/ATen/detail/MTIAHooksInterface.cpp index d2e331abb0c04..f4bd44b96649a 100644 --- a/aten/src/ATen/detail/MTIAHooksInterface.cpp +++ b/aten/src/ATen/detail/MTIAHooksInterface.cpp @@ -21,10 +21,13 @@ bool isMTIAHooksBuilt() { } // namespace detail +<<<<<<< HEAD bool MTIAHooksInterface::isAvailable() const { return detail::isMTIAHooksBuilt() && detail::getMTIAHooks().deviceCount() > 0; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) C10_DEFINE_REGISTRY(MTIAHooksRegistry, MTIAHooksInterface, MTIAHooksArgs) } // namespace at diff --git a/aten/src/ATen/detail/MTIAHooksInterface.h b/aten/src/ATen/detail/MTIAHooksInterface.h index b415862f29e7c..5e140a93a1190 100644 --- a/aten/src/ATen/detail/MTIAHooksInterface.h +++ b/aten/src/ATen/detail/MTIAHooksInterface.h @@ -149,8 +149,11 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { FAIL_MTIAHOOKS_FUNC(__func__); return; } +<<<<<<< HEAD virtual bool isAvailable() const override; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; struct TORCH_API MTIAHooksArgs {}; diff --git a/aten/src/ATen/detail/PrivateUse1HooksInterface.h b/aten/src/ATen/detail/PrivateUse1HooksInterface.h index 1ab3e99e10773..78d169fa0e2e6 100644 --- a/aten/src/ATen/detail/PrivateUse1HooksInterface.h +++ b/aten/src/ATen/detail/PrivateUse1HooksInterface.h @@ -38,7 +38,11 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface { Generator getNewGenerator( [[maybe_unused]] DeviceIndex device_index = -1) const override { +<<<<<<< HEAD // TODO(FFFrog): Preserved for BC and will be removed in the future. +======= + // TODO(FFFrog): Perserved for BC and will be removed in the future. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (at::GetGeneratorPrivate().has_value()) return at::GetGeneratorForPrivateuse1(device_index); diff --git a/aten/src/ATen/dlpack.h b/aten/src/ATen/dlpack.h index f1b3ae2b7760b..8082efaf83684 100644 --- a/aten/src/ATen/dlpack.h +++ b/aten/src/ATen/dlpack.h @@ -15,11 +15,19 @@ #define DLPACK_EXTERN_C #endif +<<<<<<< HEAD /*! \brief The current major version of dlpack */ #define DLPACK_MAJOR_VERSION 1 /*! \brief The current minor version of dlpack */ #define DLPACK_MINOR_VERSION 1 +======= +/*! \brief The current version of dlpack */ +#define DLPACK_VERSION 80 + +/*! \brief The current ABI version of dlpack */ +#define DLPACK_ABI_VERSION 1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /*! 
\brief DLPACK_DLL prefix for windows */ #ifdef _WIN32 @@ -32,12 +40,19 @@ #define DLPACK_DLL #endif +<<<<<<< HEAD +#include +======= +// NOLINTNEXTLINE(modernize-deprecated-headers) #include +// NOLINTNEXTLINE(modernize-deprecated-headers) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #ifdef __cplusplus extern "C" { #endif +<<<<<<< HEAD /*! * \brief The DLPack version. @@ -65,6 +80,8 @@ typedef struct { uint32_t minor; } DLPackVersion; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /*! * \brief The device type in DLDevice. */ @@ -116,7 +133,11 @@ typedef enum { kDLWebGPU = 15, /*! \brief Qualcomm Hexagon DSP */ kDLHexagon = 16, +<<<<<<< HEAD /*! \brief Microsoft MAIA devices */ +======= + /*! \brief Microsoft AI Accelerator */ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) kDLMAIA = 17, } DLDeviceType; @@ -157,6 +178,7 @@ typedef enum { kDLComplex = 5U, /*! \brief boolean */ kDLBool = 6U, +<<<<<<< HEAD /*! \brief FP8 data types */ kDLFloat8_e3m4 = 7U, kDLFloat8_e4m3 = 8U, @@ -177,6 +199,8 @@ typedef enum { * while the consumer must stop importing if the value is unexpected. */ kDLFloat4_e2m1fn = 17U, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } DLDataTypeCode; /*! @@ -190,12 +214,15 @@ typedef enum { * - int8: type_code = 0, bits = 8, lanes = 1 * - std::complex: type_code = 5, bits = 64, lanes = 1 * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits) +<<<<<<< HEAD * - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory) * - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory) * - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory) * * When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e., * for a packed data set D ((D >> (i * bits)) && bit_mask) stores the i-th element. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) */ typedef struct { /*! @@ -223,7 +250,11 @@ typedef struct { * `byte_offset` field should be used to point to the beginning of the data. * * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow, +<<<<<<< HEAD * TVM, perhaps others) do not adhere to this 256 byte alignment requirement +======= + * TVM, perhaps others) do not adhere to this 256 byte aligment requirement +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed * (after which this note will be updated); at the moment it is recommended * to not rely on the data pointer being correctly aligned. @@ -241,9 +272,12 @@ typedef struct { * return size; * } * \endcode +<<<<<<< HEAD * * Note that if the tensor is of size zero, then the data pointer should be * set to `NULL`. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) */ void* data; /*! \brief The device of the tensor */ @@ -253,12 +287,20 @@ typedef struct { /*! \brief The data type of the pointer*/ DLDataType dtype; /*! 
\brief The shape of the tensor */ +<<<<<<< HEAD int64_t* shape; +======= + const int64_t* shape; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /*! * \brief strides of the tensor (in number of elements, not bytes) * can be NULL, indicating tensor is compact and row-majored. */ +<<<<<<< HEAD int64_t* strides; +======= + const int64_t* strides; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /*! \brief The offset in bytes to the beginning pointer to data */ uint64_t byte_offset; } DLTensor; @@ -269,6 +311,7 @@ typedef struct { * not meant to transfer the tensor. When the borrowing framework doesn't need * the tensor, it should call the deleter to notify the host that the resource * is no longer needed. +<<<<<<< HEAD * * \note This data structure is used as Legacy DLManagedTensor * in DLPack exchange and is deprecated after DLPack v0.8 @@ -276,6 +319,8 @@ typedef struct { * This data structure may get renamed or deleted in future versions. * * \sa DLManagedTensorVersioned +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) */ typedef struct DLManagedTensor { /*! \brief DLTensor which is being memory managed */ @@ -284,6 +329,7 @@ typedef struct DLManagedTensor { * which DLManagedTensor is used in the framework. It can also be NULL. */ void * manager_ctx; +<<<<<<< HEAD /*! * \brief Destructor - this should be called * to destruct the manager_ctx which backs the DLManagedTensor. It can be @@ -360,6 +406,15 @@ struct DLManagedTensorVersioned { DLTensor dl_tensor; }; +======= + /*! \brief Destructor signature void (*)(void*) - this should be called + * to destruct manager_ctx which holds the DLManagedTensor. It can be NULL + * if there is no way for the caller to provide a reasonable destructor. + * The destructors deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensor * self); +} DLManagedTensor; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifdef __cplusplus } // DLPACK_EXTERN_C #endif diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp index d58d436c511d1..5cb6684c332f2 100644 --- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp +++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp @@ -158,7 +158,10 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) { OP_DECOMPOSE(kron); OP_DECOMPOSE(l1_loss); m.impl("layer_norm", native::layer_norm_symint); +<<<<<<< HEAD m.impl("_fused_rms_norm", native::rms_norm_composite); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OP_DECOMPOSE2(ldexp, Tensor); OP_DECOMPOSE2(less_equal, Tensor ); OP_DECOMPOSE2(less, Tensor ); diff --git a/aten/src/ATen/functorch/BatchRulesHelper.h b/aten/src/ATen/functorch/BatchRulesHelper.h index 0d2f075d0c540..e916e9561c206 100644 --- a/aten/src/ATen/functorch/BatchRulesHelper.h +++ b/aten/src/ATen/functorch/BatchRulesHelper.h @@ -283,7 +283,11 @@ inline void boxed_existing_bdim_all_batch_rule( // Use when all tensors arguments accept one (normal) batch dim. // This batching rule expands the batch dim on all Tensors, reshapes it into // dim 0, calls the op, and then reshapes the batch dim out of dim 0. 
+<<<<<<< HEAD // This is not the most efficient thing; if there are alternatives, please try +======= +// This is not the most efficient thing; if there are alternatives, plese try +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // to use them. Use this only as a last resort. #define EXISTING_BDIM_ALL_BOXED(op) \ m.impl(#op, torch::CppFunction::makeFromBoxedFunction()); @@ -410,7 +414,11 @@ struct ExistingBdimBatchRuleHelper +<<<<<<< HEAD Tensor& unary_inplace_batch_rule(Tensor& self, std::optional /*unused*/, ExtraArgs... extra_args) { +======= +Tensor& unary_inplace_batch_rule(Tensor& self, std::optional, ExtraArgs... extra_args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) INVOKE(self, Method)(std::forward(extra_args)...); return self; } diff --git a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp index 804d6953bd410..4de3fa3d0ef8a 100644 --- a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp +++ b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp @@ -39,7 +39,11 @@ Tensor vdot_decomp(const Tensor& A, const Tensor& B) { // NB: I wrote this like this because we *might* want its for a future matmul // batch rule that isn't decomposed... // "tv" = tensor @ vector +<<<<<<< HEAD std::tuple> tv_batch_rule( +======= +static std::tuple> tv_batch_rule( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, std::optional self_bdim, const Tensor& other, std::optional other_bdim) { if (self_bdim && other_bdim) { @@ -66,7 +70,11 @@ std::tuple> tv_batch_rule( TORCH_INTERNAL_ASSERT(false, "can't get here"); } +<<<<<<< HEAD std::tuple> mv_batch_rule( +======= +static std::tuple> mv_batch_rule( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, std::optional self_bdim, const Tensor& other, std::optional other_bdim) { auto self_logical_rank = rankWithoutBatchDim(self, self_bdim); @@ -79,7 +87,11 @@ std::tuple> mv_batch_rule( return tv_batch_rule(self, self_bdim, other, other_bdim); } +<<<<<<< HEAD std::tuple> mm_batch_rule( +======= +static std::tuple> mm_batch_rule( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, std::optional self_bdim, const Tensor& other, std::optional other_bdim) { auto self_logical_rank = rankWithoutBatchDim(self, self_bdim); @@ -94,7 +106,11 @@ std::tuple> mm_batch_rule( return std::make_tuple( at::matmul(self_, other_), 0 ); } +<<<<<<< HEAD std::tuple> bmm_batch_rule( +======= +static std::tuple> bmm_batch_rule( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, std::optional self_bdim, const Tensor& other, std::optional other_bdim) { auto self_logical_rank = rankWithoutBatchDim(self, self_bdim); @@ -176,7 +192,11 @@ struct LinalgCheckMatrixUnaryRuleHelper; template struct LinalgCheckMatrixUnaryRuleHelper> { +<<<<<<< HEAD static Tensor check_and_reshape_input(const Tensor& tensor, std::optional batch_dim) { +======= + static inline Tensor check_and_reshape_input(const Tensor& tensor, std::optional batch_dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast 
kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(rankWithoutBatchDim(tensor, batch_dim) >= 2, op_name, ": The input tensor A must have at least 2 dimensions."); return moveBatchDimToFront(tensor, batch_dim); } @@ -222,7 +242,11 @@ struct LinalgCheckMatrixBinaryRuleHelper; template struct LinalgCheckMatrixBinaryRuleHelper> { +<<<<<<< HEAD static std::tuple check_inputs_and_reshape_inputs( +======= + static inline std::tuple check_inputs_and_reshape_inputs( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& first, std::optional first_bdim, const Tensor& second, std::optional second_bdim) { TORCH_CHECK(rankWithoutBatchDim(first, first_bdim) >= 2, @@ -250,7 +274,11 @@ struct LinalgCheckMatrixBinaryRuleHelper> } }; +<<<<<<< HEAD void expect_at_least_rank( +======= +static void expect_at_least_rank( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& tensor, std::optional tensor_bdim, int64_t expected_rank, @@ -384,7 +412,11 @@ fourOutputs solve_ex_batch_rule( // NOTE [ solve_ex Batch Rule Contiguity ] // A determines whether or not linalg_solve takes an optimized path. We need the check on A_ to match the one run on +<<<<<<< HEAD // A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behavior +======= + // A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behvaior +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // differs based on whether or not the optimized path was taken const auto batched_A_was_contiguous = A_bdim.has_value() ? at::select(A, *A_bdim, 0).is_contiguous() : A.is_contiguous(); if (batched_A_was_contiguous && !A.is_complex()) { @@ -472,7 +504,11 @@ atol_rtol_tensor_batch_rule( return std::make_tuple(Func(input_, atol_, rtol_, hermitian), 0); } +<<<<<<< HEAD std::tuple> +======= +static std::tuple> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pinv_batch_rule( const Tensor& input, std::optional input_bdim, const std::optional& atol, const std::optional atol_bdim, const std::optional& rtol, diff --git a/aten/src/ATen/functorch/BatchRulesModules.cpp b/aten/src/ATen/functorch/BatchRulesModules.cpp index 5fba8d257ceb8..6cf7194037f38 100644 --- a/aten/src/ATen/functorch/BatchRulesModules.cpp +++ b/aten/src/ATen/functorch/BatchRulesModules.cpp @@ -7,7 +7,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include @@ -45,6 +48,7 @@ static std::tuple> embedding_batch_rule( const auto weight_ = reshape_dim_into(*weight_bdim, 0, weight); auto indices_ = moveBatchDimToFront(indices, indices_bdim); +<<<<<<< HEAD { // getStepTensor returns a regular Tensor. If indices_ is a DTensor // we want to allow this mixed DTensor-Tensor operation. 
@@ -52,6 +56,10 @@ static std::tuple> embedding_batch_rule( const auto range = getStepTensor(indices, batch_size, num_embeddings); indices_ = indices_ + range; } +======= + const auto range = getStepTensor(indices, batch_size, num_embeddings); + indices_ = indices_ + range; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto result = at::embedding_symint(weight_, indices_, std::move(padding_idx), scale_grad_by_freq, sparse); return std::make_tuple(std::move(result), 0); } @@ -213,6 +221,7 @@ static cudnn_grid_sample_backward_batch_rule( return grid_sample_backward_helper_out(std::move(bw_out), 0, 0, bdim_size); } +<<<<<<< HEAD // uses functional formulation for one_hot under vmap to be compatible with // fakeTensor/dynamic shapes and compiled functorch transforms. // mirrors the meta path in aten/src/ATen/native/Onehot.cpp, @@ -229,6 +238,42 @@ static Tensor one_hot_decomposition_hack(const Tensor &self, int64_t num_classes const auto options = self.options(); at::Tensor index = at::arange(num_classes, options); return at::eq(self.unsqueeze(-1), index).to(at::kLong); +======= +// TODO: replace with targetable functionalization +static Tensor one_hot_decomposition_hack(const Tensor &self, int64_t num_classes) { + TORCH_CHECK(self.dtype() == kLong, "one_hot is only applicable to index tensor."); + auto shape = self.sym_sizes().vec(); + + // empty tensor could be converted to one hot representation, + // but shape inference is not possible. + if (self.sym_numel() == 0) { + if (num_classes <= 0) { + TORCH_CHECK(false, "Can not infer total number of classes from empty tensor."); + } else { + shape.emplace_back(num_classes); + return at::empty_symint(shape, self.options()); + } + } + + TORCH_CHECK(num_classes > 0, "When vmap-ing torch.nn.functional.one_hot, please " + "provide an explicit positive num_classes argument."); + + // Disabling all of the following checks. This is OK because scatter has checks too. + // Maybe one_hot should be a primitive wrt autograd so we don't have to deal with this. + // // non-empty tensor + // if (self.device().type() != at::kCUDA) { + // //for cuda, rely on device assert thrown by scatter + // TORCH_CHECK(self.min().item().toLong() >= 0, "Class values must be non-negative."); + // } + // if (self.device().type() != at::kCUDA) { + // //rely on device asserts from scatter to avoid sync here + // TORCH_CHECK(num_classes > self.max().item().toLong(), "Class values must be smaller than num_classes."); + // } + + shape.emplace_back(num_classes); + Tensor ret = at::zeros_symint(shape, self.options()); + return ret.scatter(-1, self.unsqueeze(-1), 1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template diff --git a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp index de1a37a9b4320..f6854d5321285 100644 --- a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp @@ -282,7 +282,11 @@ static std::tuple> _softmax_backward_batch_rule( dim = getPhysicalDim(output_, /*has_batch_dim*/true, dim); +<<<<<<< HEAD // Not sure why output_ needs to be marked as .contiguous(). Something must +======= + // Not sure why output_ needs to be marked as .contiguous(). 
Someting must +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // have changed in PyTorch (and output of softmax is probably always contiguous) return std::make_tuple(at::_softmax_backward_data(grad_output_, output_.contiguous(), dim, input_dtype), 0); } diff --git a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp index f5c770371de8e..cefbc94a80204 100644 --- a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp @@ -12,14 +12,21 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOLINTBEGIN(bugprone-unchecked-optional-access) namespace at::functorch { namespace { +<<<<<<< HEAD bool any_has_value(ArrayRef> bdims) { +======= +static bool any_has_value(ArrayRef> bdims) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto& bdim : bdims) { if (bdim.has_value()) { return true; @@ -28,7 +35,11 @@ bool any_has_value(ArrayRef> bdims) { return false; } +<<<<<<< HEAD int64_t get_num_leading_nones(ArrayRef> indices) { +======= +static int64_t get_num_leading_nones(ArrayRef> indices) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t result = 0; for (const auto& idx : indices) { if (!idx.has_value() || !idx->defined()) { @@ -40,7 +51,11 @@ int64_t get_num_leading_nones(ArrayRef> indices) { return result; } +<<<<<<< HEAD int64_t get_max_index_logical_dim( +======= +static int64_t get_max_index_logical_dim( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ArrayRef> indices, ArrayRef> indices_bdims) { int64_t max_logical_dim = -1; @@ -57,7 +72,11 @@ int64_t get_max_index_logical_dim( return max_logical_dim; } +<<<<<<< HEAD std::vector> batchIndices( +======= +static std::vector> batchIndices( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::TensorOptions options, ArrayRef> indices, ArrayRef> indices_bdims, @@ -95,10 +114,16 @@ std::vector> batchIndices( if (index.has_value() && index->sym_numel() != 0) { const auto idx_bdim = indices_bdims[i]; indices_.emplace_back(maybePadToLogicalRank(moveBatchDimToFront(index.value(), idx_bdim), idx_bdim, maxLogicalRank)); +<<<<<<< HEAD TORCH_CHECK( !(index.value().dtype() == kBool) || !indices_bdims[i].has_value(), "vmap: We do not support batching operators that can support dynamic shape. Attempting to batch over indexing with a boolean mask." ); +======= + if (index.value().dtype() == kBool && indices_bdims[i].has_value()) { + throw std::runtime_error("vmap: We do not support batching operators that can support dynamic shape. Attempting to batch over indexing with a boolean mask."); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { indices_.push_back(index); } @@ -126,7 +151,11 @@ std::vector> batchIndices( // Define an "advanced index" to be a selection object that is // a non-trivial Tensor (i.e. it does not represent :). 
+<<<<<<< HEAD bool is_advanced_index(const std::optional& idx) { +======= +static bool is_advanced_index(const std::optional& idx) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (!idx.has_value()) { return false; } @@ -137,7 +166,11 @@ bool is_advanced_index(const std::optional& idx) { } // See NOTE: [advanced indices adjacent] for definition +<<<<<<< HEAD bool are_advanced_indices_adjacent(ArrayRef> indices) { +======= +static bool are_advanced_indices_adjacent(ArrayRef> indices) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t num_advanced_indices_regions = 0; bool in_advanced_indices_region = false; for (const auto& idx : indices) { @@ -165,7 +198,11 @@ bool are_advanced_indices_adjacent(ArrayRef> indices) { // - result: Tensor[B, 4, 5, 6, 2, 3, 7, 8] // ------- ---- // region2 region1 +<<<<<<< HEAD Tensor swap_regions(const Tensor& tensor, int64_t first_region_size, int64_t second_region_size) { +======= +static Tensor swap_regions(const Tensor& tensor, int64_t first_region_size, int64_t second_region_size) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VmapDimVector permutation(tensor.dim(), 0); std::iota(permutation.begin(), permutation.end(), 0); std::rotate( @@ -553,7 +590,11 @@ Tensor &_index_put_impl__plumbing(Tensor &self, const List return self; } +<<<<<<< HEAD Tensor maybe_permute_values( +======= +static Tensor maybe_permute_values( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& values, ArrayRef> orig_indices, ArrayRef> orig_indices_bdims) { @@ -1052,7 +1093,11 @@ std::tuple> index_add_batch_rule( other, other_bdim, alpha, false); } +<<<<<<< HEAD std::tuple binary_pointwise_align( +======= +static std::tuple binary_pointwise_align( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor & self, std::optional self_bdim, const Tensor & mask, diff --git a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp index 48a735c3e5332..88c1fc755aae4 100644 --- a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp @@ -171,8 +171,11 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { POINTWISE_BOXED(fill_.Scalar); POINTWISE_BOXED(zero_); +<<<<<<< HEAD // This is special because this op doesn't return anything m.impl("_assert_tensor_metadata", native::_assert_tensor_metadata); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #undef UNARY_POINTWISE #undef UNARY_POINTWISE_ALL diff --git a/aten/src/ATen/functorch/BatchRulesViews.cpp b/aten/src/ATen/functorch/BatchRulesViews.cpp index 08db1d202b4eb..94a2629586c1b 100644 --- a/aten/src/ATen/functorch/BatchRulesViews.cpp +++ b/aten/src/ATen/functorch/BatchRulesViews.cpp @@ -346,7 +346,11 @@ std::tuple> slice_batch_rule( return std::make_tuple(std::move(result), 0); } +<<<<<<< HEAD bool is_allowed_dim_on_scalar_tensor(int64_t dim) { +======= +static bool is_allowed_dim_on_scalar_tensor(int64_t dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return dim == 0 || dim 
== -1; } diff --git a/aten/src/ATen/functorch/BatchedFallback.cpp b/aten/src/ATen/functorch/BatchedFallback.cpp index 92123c1cd0e22..664afede8aa9b 100644 --- a/aten/src/ATen/functorch/BatchedFallback.cpp +++ b/aten/src/ATen/functorch/BatchedFallback.cpp @@ -224,7 +224,11 @@ static Tensor safeStack(TensorList tensors) { // is possible for the backward function to return an undefined grad for some // grad_input for each example. In that case, we return an undefined grad. // +<<<<<<< HEAD // It is theoretically possible for *some* of the examples to produce an +======= + // It is theoretically posssible for *some* of the examples to produce an +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // undefined grad (a kernel could peek at the gradient values and return an // undefined tensor if it determines the gradient is full of zeros). We // could handle this by treating the undefined grad as a zero-filled tensor diff --git a/aten/src/ATen/functorch/BatchedTensorImpl.cpp b/aten/src/ATen/functorch/BatchedTensorImpl.cpp index 895770fc69921..5969881a1ebac 100644 --- a/aten/src/ATen/functorch/BatchedTensorImpl.cpp +++ b/aten/src/ATen/functorch/BatchedTensorImpl.cpp @@ -113,7 +113,11 @@ SymIntArrayRef BatchedTensorImpl::sym_sizes_custom() const { return sym_sizes_default(); } +<<<<<<< HEAD // The following are publicly exposed as methods of Tensor +======= +// The following are publically exposed as methods of Tensor +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) IntArrayRef BatchedTensorImpl::strides_custom() const { return strides_default(); @@ -126,7 +130,11 @@ SymIntArrayRef BatchedTensorImpl::sym_strides_custom() const { // TODO: implement proper contiguity on batched tensor, then put // sizes_strides_policy back to Default +<<<<<<< HEAD c10::SymBool BatchedTensorImpl::sym_is_contiguous_custom(at::MemoryFormat memory_format) const { +======= +bool BatchedTensorImpl::is_contiguous_custom(at::MemoryFormat memory_format) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(memory_format == MemoryFormat::Contiguous, "NYI: querying is_contiguous inside of vmap for memory_format ", "other than torch.contiguous_format"); diff --git a/aten/src/ATen/functorch/BatchedTensorImpl.h b/aten/src/ATen/functorch/BatchedTensorImpl.h index 985b289b3fe02..c910c73210d01 100644 --- a/aten/src/ATen/functorch/BatchedTensorImpl.h +++ b/aten/src/ATen/functorch/BatchedTensorImpl.h @@ -69,7 +69,11 @@ struct TORCH_API BatchedTensorImpl : public c10::TensorImpl { IntArrayRef strides_custom() const override; SymIntArrayRef sym_strides_custom() const override; // Override a bunch of methods inherited from TensorImpl to return error messages. 
+<<<<<<< HEAD c10::SymBool sym_is_contiguous_custom(at::MemoryFormat memory_format) const override; +======= + bool is_contiguous_custom(at::MemoryFormat memory_format=at::MemoryFormat::Contiguous) const override; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void set_size(int64_t dim, int64_t new_size) override; void set_stride(int64_t dim, int64_t new_stride) override; c10::intrusive_ptr shallow_copy_and_detach( @@ -160,10 +164,13 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({ DispatchKey::CUDA, DispatchKey::CPU, DispatchKey::PrivateUse1, +<<<<<<< HEAD DispatchKey::SparseCPU, DispatchKey::SparseCUDA, DispatchKey::SparseCsrCPU, DispatchKey::SparseCsrCUDA, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); inline DispatchKeySet getKeysToPropagateToWrapper(const Tensor& tensor, DispatchKeySet to_propagate=kKeysToPropagateToWrapper) { diff --git a/aten/src/ATen/functorch/DynamicLayer.cpp b/aten/src/ATen/functorch/DynamicLayer.cpp index 69af08a7bd7ce..a84f1b2c4113e 100644 --- a/aten/src/ATen/functorch/DynamicLayer.cpp +++ b/aten/src/ATen/functorch/DynamicLayer.cpp @@ -465,11 +465,19 @@ static void dynamicLayerBack(const c10::OperatorHandle& op, torch::jit::Stack* s // used for functions that have aliasing operations but should be treated like they're out of place (i.e. lift_fresh) static void dynamicLayerBackGradSpecialCase(const c10::OperatorHandle& op, torch::jit::Stack* stack) { +<<<<<<< HEAD dynamicLayerBack(op, stack, true); } static void dynamicLayerBackFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { dynamicLayerBack(op, stack, false); +======= + return dynamicLayerBack(op, stack, true); +} + +static void dynamicLayerBackFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { + return dynamicLayerBack(op, stack, false); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } TORCH_LIBRARY_IMPL(_, FuncTorchDynamicLayerFrontMode, m) { diff --git a/aten/src/ATen/functorch/DynamicLayer.h b/aten/src/ATen/functorch/DynamicLayer.h index 672a33fda0016..4d5e82a310cd5 100644 --- a/aten/src/ATen/functorch/DynamicLayer.h +++ b/aten/src/ATen/functorch/DynamicLayer.h @@ -37,7 +37,11 @@ namespace at::functorch { // how to perform the transform. // // TODO: we can excise DynamicLayer in favor of Interpreter, +<<<<<<< HEAD // But I am going to leave it for now as a compatibility shim to avoid +======= +// But I am going to leave it for now as a compatiblity shim to avoid +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // needing to refactor a lot of callsites... 
struct TORCH_API DynamicLayer { explicit DynamicLayer( diff --git a/aten/src/ATen/functorch/Interpreter.h b/aten/src/ATen/functorch/Interpreter.h index 3d3b2069387d7..2879f3013e2b5 100644 --- a/aten/src/ATen/functorch/Interpreter.h +++ b/aten/src/ATen/functorch/Interpreter.h @@ -3,7 +3,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -88,7 +91,11 @@ std::ostream& operator<<(std::ostream& os, const TransformType& t); // >>> VmapInterpreterPtr(&interpreter).batchSize() // // Finally, Interpreter::process switches on the type of the interpreter +<<<<<<< HEAD // and calls one of {Transform}Interpreter::processImpl under the hood. +======= +// and calls one of {Transform}Intepreter::processImpl under the hood. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Same for Interpreter::sendToNextInterpreter :) struct VmapInterpreterMeta { @@ -107,10 +114,16 @@ struct VmapInterpreterMeta { template friend void to_json(T& json_j, const VmapInterpreterMeta& json_t) { +<<<<<<< HEAD TORCH_CHECK( !json_t.batchSize_.is_heap_allocated(), "Serialization for heap-allocated SymInt is not implemented yet" ); +======= + if (json_t.batchSize_.is_heap_allocated()) { + throw std::runtime_error("Serialization for heap-allocated SymInt is not implemented yet"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) json_j["batchSize"] = json_t.batchSize_.as_int_unchecked(); json_j["randomness"] = static_cast(json_t.randomness_); } @@ -304,7 +317,11 @@ struct Interpreter { } else if (meta.contains("Functionalize")) { json_t.meta_.emplace(meta["Functionalize"].template get()); } else { +<<<<<<< HEAD TORCH_CHECK(false, "unknown interpreter metadata type"); +======= + throw std::runtime_error("unknown interpreter metadata type"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } diff --git a/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp b/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp index 0c2ed37d23765..6816f5740a6e8 100644 --- a/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp +++ b/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp @@ -68,18 +68,30 @@ namespace at::functorch { namespace{ // PyTorch allows operations to specify dim 0 and dim -1 on a scalar tensor. +<<<<<<< HEAD bool is_allowed_dim_on_scalar_tensor(int64_t dim) { return dim == 0 || dim == -1; } int64_t get_current_level() { +======= +static bool is_allowed_dim_on_scalar_tensor(int64_t dim) { + return dim == 0 || dim == -1; +} + +static int64_t get_current_level() { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto maybe_level = maybeCurrentDynamicLayer(); TORCH_INTERNAL_ASSERT(maybe_level.has_value()); return maybe_level->layerId(); } // This check should probably go into the dispatcher... 
+<<<<<<< HEAD bool participatesInCurrentLevel(const Tensor& self) { +======= +static bool participatesInCurrentLevel(const Tensor& self) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto current_level = get_current_level(); auto* maybe_batched_impl = maybeGetBatchedImpl(self); if (!maybe_batched_impl) { @@ -90,7 +102,11 @@ bool participatesInCurrentLevel(const Tensor& self) { return self_level == current_level; } +<<<<<<< HEAD bool participatesInCurrentLevel(ITensorListRef self) { +======= +static bool participatesInCurrentLevel(ITensorListRef self) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const Tensor& tensor : self) { if (participatesInCurrentLevel(tensor)) { return true; @@ -285,7 +301,11 @@ std::vector unbind_batching_rule(const Tensor& self, int64_t dim) { // given (sizes, strides, storage_offset) returns the maximum location that // can be indexed (or nullopt if such a location doesn't exist, e.g., tensors // with zero-size dims). +<<<<<<< HEAD std::optional maximum_indexable_location( +======= +static std::optional maximum_indexable_location( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::SymIntArrayRef sizes, c10::SymIntArrayRef strides, const c10::SymInt& storage_offset) { auto result = native::storage_size_for(sizes, strides); if (result == 0) { @@ -298,7 +318,11 @@ std::optional maximum_indexable_location( // This checks that the range of possible memory locations accessible by // x.as_strided(sizes, strides, maybe_storage_offset) // are within the bounds of possible memory locations accessible by x. 
+<<<<<<< HEAD void checkBasicAsStridedValidForSlice( +======= +static void checkBasicAsStridedValidForSlice( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& physical_tensor, int64_t num_batch_dims, c10::SymIntArrayRef sizes, @@ -733,7 +757,11 @@ TORCH_LIBRARY_IMPL(_, FuncTorchBatched, m) { } TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { +<<<<<<< HEAD // still legacy b/c returns multiple tensors +======= + // still legacy b/c teturns multiple tensors +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) m.impl("split.Tensor", split_batching_rule); m.impl("split_with_sizes", split_with_sizes_batching_rule); m.impl("split_with_sizes_copy", split_with_sizes_copy_batching_rule); diff --git a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp index 667e92970033c..18d6ef008019e 100644 --- a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp +++ b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp @@ -6,7 +6,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -71,7 +74,11 @@ Tensor linear_hack(const Tensor& input, const Tensor& weight, const std::optiona return output; } +<<<<<<< HEAD inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64_t reduction) { +======= +static inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64_t reduction) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (reduction == at::Reduction::Mean) { return unreduced.mean(); } else if (reduction == at::Reduction::Sum) { @@ -109,7 +116,13 @@ Tensor binary_cross_entropy_with_logits_hack( } Tensor trace_backward_decomp(const Tensor& grad, IntArrayRef sizes) { +<<<<<<< HEAD TORCH_CHECK(sizes.size() == 2, "expected matrix input"); +======= + if (sizes.size() != 2) { + throw std::runtime_error("expected matrix input"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto grad_input = at::zeros(sizes[0] * sizes[1], grad.options()); auto indices = at::arange(0, grad_input.numel(), sizes[1] + 1, grad.options().dtype(at::kLong)); // Workaround using index_put instead of yet unsupported index_fill_ @@ -127,7 +140,11 @@ namespace { template using Ctype = std::conditional_t; +<<<<<<< HEAD Tensor make_feature_noise(const Tensor& input) { +======= +static Tensor make_feature_noise(const Tensor& input) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto input_sizes = input.sizes(); TORCH_CHECK(input.dim() >= 2, "Feature dropout requires at least 2 dimensions in the input"); std::vector sizes; @@ -141,7 +158,11 @@ Tensor make_feature_noise(const Tensor& input) { return at::empty(sizes, input.options()); } +<<<<<<< HEAD bool is_fused_kernel_acceptable(const Tensor& input, double p) { +======= +static bool is_fused_kernel_acceptable(const Tensor& input, double p) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (input.is_cuda() || input.is_xpu() || input.is_lazy() || input.is_privateuseone()) && p > 0 && 
p < 1 && input.numel() > 0; } @@ -210,7 +231,11 @@ ALIAS_SPECIALIZATION(_feature_dropout, true, false) ALIAS_SPECIALIZATION(_alpha_dropout, false, true ) ALIAS_SPECIALIZATION(_feature_alpha_dropout, true, true ) +<<<<<<< HEAD Tensor dropout(const Tensor& input, double p, bool train) { +======= +static Tensor dropout(const Tensor& input, double p, bool train) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto result = [&]() { NoNamesGuard guard; if (train && is_fused_kernel_acceptable(input, p)) { diff --git a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h index cfdecaac778b3..b72da1fda9b6f 100644 --- a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h @@ -37,6 +37,7 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo allocator_->copy_data(dest, src, count); } +<<<<<<< HEAD // From DeviceAllocator bool initialized() override { @@ -64,6 +65,8 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo allocator_->resetPeakStats(device); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // From CUDAAllocator void* raw_alloc(size_t nbytes) override { @@ -82,6 +85,13 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo allocator_->init(device_count); } +<<<<<<< HEAD +======= + bool initialized() override { + return allocator_->initialized(); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) double getMemoryFraction(c10::DeviceIndex device) override { return allocator_->getMemoryFraction(device); } @@ -90,8 +100,13 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo allocator_->setMemoryFraction(fraction, device); } +<<<<<<< HEAD std::vector getExpandableSegmentSizes(c10::DeviceIndex device) override { return allocator_->getExpandableSegmentSizes(device); +======= + void emptyCache(MempoolId_t mempool_id = {0, 0}) override { + allocator_->emptyCache(mempool_id); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void enable(bool value) override { @@ -114,6 +129,21 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo allocator_->recordStream(ptr, stream); } +<<<<<<< HEAD +======= + CachingDeviceAllocator::DeviceStats getDeviceStats(c10::DeviceIndex device) override { + return allocator_->getDeviceStats(device); + } + + void resetAccumulatedStats(c10::DeviceIndex device) override { + allocator_->resetAccumulatedStats(device); + } + + void resetPeakStats(c10::DeviceIndex device) override { + allocator_->resetPeakStats(device); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) override { return allocator_->snapshot(mempool_id); } diff --git a/aten/src/ATen/metal/Context.h b/aten/src/ATen/metal/Context.h index e4c6da738e0db..b5379c9b257a4 100644 --- a/aten/src/ATen/metal/Context.h +++ b/aten/src/ATen/metal/Context.h @@ -18,7 +18,11 @@ extern std::atomic g_metal_impl_registry; class MetalImplRegistrar { public: +<<<<<<< HEAD explicit 
MetalImplRegistrar(MetalInterface* /*impl*/); +======= + explicit MetalImplRegistrar(MetalInterface*); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; at::Tensor& metal_copy_(at::Tensor& self, const at::Tensor& src); diff --git a/aten/src/ATen/mps/EmptyTensor.cpp b/aten/src/ATen/mps/EmptyTensor.cpp index 6c58de099648d..e8c178b18f00d 100644 --- a/aten/src/ATen/mps/EmptyTensor.cpp +++ b/aten/src/ATen/mps/EmptyTensor.cpp @@ -12,7 +12,11 @@ #define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled" #define MPS_ERROR_RUNTIME_TOO_LOW \ +<<<<<<< HEAD "The MPS backend is supported on MacOS 14.0+. ", \ +======= + "The MPS backend is supported on MacOS 13.0+.", \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "Current OS version can be queried using `sw_vers`" #define MPS_ERROR_DOUBLE_NOT_SUPPORTED "Cannot convert a MPS Tensor to float64 dtype " \ "as the MPS framework doesn't support float64. Please use float32 instead." @@ -43,6 +47,10 @@ TensorBase empty_mps( int64_t nelements = c10::multiply_integers(size); auto dtype = dtype_or_default(dtype_opt); TORCH_CHECK_TYPE(dtype != ScalarType::Double, MPS_ERROR_DOUBLE_NOT_SUPPORTED); +<<<<<<< HEAD +======= + TORCH_CHECK_TYPE(dtype != ScalarType::BFloat16 || is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_14_0_PLUS), "MPS BFloat16 is only supported on MacOS 14 or newer"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto dtype_meta = scalarTypeToTypeMeta(dtype); diff --git a/aten/src/ATen/mps/MPSDevice.h b/aten/src/ATen/mps/MPSDevice.h index 9b58477104978..9075e733eed99 100644 --- a/aten/src/ATen/mps/MPSDevice.h +++ b/aten/src/ATen/mps/MPSDevice.h @@ -18,7 +18,15 @@ namespace at::mps { // Helper enum to check if a MPSGraph op is supported in a given macOS version enum class MacOSVersion : uint32_t { +<<<<<<< HEAD MACOS_VER_14_4_PLUS = 0, +======= + MACOS_VER_13_1_PLUS = 0, + MACOS_VER_13_2_PLUS, + MACOS_VER_13_3_PLUS, + MACOS_VER_14_0_PLUS, + MACOS_VER_14_4_PLUS, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) MACOS_VER_15_0_PLUS, MACOS_VER_15_1_PLUS, MACOS_VER_15_2_PLUS, @@ -55,6 +63,7 @@ class TORCH_API MPSDevice { */ bool isMacOS13Plus(MacOSVersion version) const; +<<<<<<< HEAD /** * Returns device name */ @@ -66,6 +75,8 @@ class TORCH_API MPSDevice { */ unsigned getCoreCount() const; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ~MPSDevice(); private: diff --git a/aten/src/ATen/mps/MPSDevice.mm b/aten/src/ATen/mps/MPSDevice.mm index 5a37490c02402..39ddd7c1ceec0 100644 --- a/aten/src/ATen/mps/MPSDevice.mm +++ b/aten/src/ATen/mps/MPSDevice.mm @@ -32,11 +32,19 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id& de MPSDevice::MPSDevice() : _mtl_device(nil) { // Check that MacOS 13.0+ version of MPS framework is available +<<<<<<< HEAD // Create the MPSGraph and check method introduced in 14.0 // which is used by MPS backend. id mpsCD = NSClassFromString(@"MPSGraph"); if ([mpsCD instancesRespondToSelector:@selector(HermiteanToRealFFTWithTensor:axes:descriptor:name:)] == NO) { +======= + // Create the MPSGraph and check method introduced in 13.0 + // which is used by MPS backend. 
+ id mpsCD = NSClassFromString(@"MPSGraph"); + + if ([mpsCD instancesRespondToSelector:@selector(cumulativeSumWithTensor:axis:name:)] == NO) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return; } @@ -66,12 +74,30 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id& de isOperatingSystemAtLeastVersion:{.majorVersion = major, .minorVersion = minor, .patchVersion = 0}]; } }; +<<<<<<< HEAD +======= + static bool _macos_13_1_plus = is_os_version_at_least(13, 1); + static bool _macos_13_2_plus = is_os_version_at_least(13, 2); + static bool _macos_13_3_plus = is_os_version_at_least(13, 3); + static bool _macos_14_0_plus = is_os_version_at_least(14, 0); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static bool _macos_14_4_plus = is_os_version_at_least(14, 4); static bool _macos_15_0_plus = is_os_version_at_least(15, 0); static bool _macos_15_1_plus = is_os_version_at_least(15, 1); static bool _macos_15_2_plus = is_os_version_at_least(15, 2); switch (version) { +<<<<<<< HEAD +======= + case MacOSVersion::MACOS_VER_13_1_PLUS: + return _macos_13_1_plus; + case MacOSVersion::MACOS_VER_13_2_PLUS: + return _macos_13_2_plus; + case MacOSVersion::MACOS_VER_13_3_PLUS: + return _macos_13_3_plus; + case MacOSVersion::MACOS_VER_14_0_PLUS: + return _macos_14_0_plus; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case MacOSVersion::MACOS_VER_14_4_PLUS: return _macos_14_4_plus; case MacOSVersion::MACOS_VER_15_0_PLUS: @@ -85,6 +111,7 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id& de } } +<<<<<<< HEAD std::string MPSDevice::getName() const { @autoreleasepool { return [[_mtl_device name] UTF8String]; @@ -115,6 +142,12 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id& de at::Allocator* GetMPSAllocator(bool useSharedAllocator) { return getIMPSAllocator(useSharedAllocator); } +======= +at::Allocator* GetMPSAllocator(bool useSharedAllocator) { + return getIMPSAllocator(useSharedAllocator); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool is_available() { return MPSDevice::getInstance()->device() != nil; } diff --git a/aten/src/ATen/mps/MPSHooks.mm b/aten/src/ATen/mps/MPSHooks.mm index 34fbd31af91da..812a09750c225 100644 --- a/aten/src/ATen/mps/MPSHooks.mm +++ b/aten/src/ATen/mps/MPSHooks.mm @@ -34,7 +34,11 @@ case 14: switch (minor) { case 0: +<<<<<<< HEAD return true; +======= + return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case 4: return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_4_PLUS); default: @@ -42,7 +46,23 @@ return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_4_PLUS); } case 13: +<<<<<<< HEAD return true; +======= + switch (minor) { + case 0: + return true; + case 1: + return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_1_PLUS); + case 2: + return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_2_PLUS); + case 3: + return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); + default: + TORCH_WARN("Can't check whether running on 13.", minor, "+ returning one for 13.3+"); + return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); + } +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) default: TORCH_WARN("Checking for unexpected MacOS ", major, ".", minor, " returning false"); return false; @@ -70,10 +90,14 @@ } void* MPSHooks::getCommandBuffer() const { +<<<<<<< HEAD auto stream = at::mps::getDefaultMPSStream(); // Release pending computeCommandEncoder, as extensions is likely to allocate new one stream->endKernelCoalescing(); return stream->commandBuffer(); +======= + return at::mps::getDefaultMPSStream()->commandBuffer(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void* MPSHooks::getDispatchQueue() const { diff --git a/aten/src/ATen/mps/MPSStream.mm b/aten/src/ATen/mps/MPSStream.mm index 595d71aeef15a..c53089c67d5f5 100644 --- a/aten/src/ATen/mps/MPSStream.mm +++ b/aten/src/ATen/mps/MPSStream.mm @@ -158,6 +158,7 @@ @interface MPSGraphExecutionDescriptor () endKernelCoalescing(); id blitEncoder = [commandBuffer() blitCommandEncoder]; +<<<<<<< HEAD // For some reason fillBufferfor stopped working for length > 4Gb on MacOS 26 // See https://github.com/pytorch/pytorch/issues/163962 // Workaround by batching copy commands into 4Gb chunks @@ -170,6 +171,9 @@ @interface MPSGraphExecutionDescriptor () bytes_filled += bytes_to_copy; bytes_remains -= bytes_to_copy; } +======= + [blitEncoder fillBuffer:buffer range:NSMakeRange(offset, length) value:value]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [blitEncoder endEncoding]; synchronize(syncType); } diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index c164120a1f3c4..b6c3e0e76c983 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -240,8 +240,13 @@ TORCH_META_FUNC(gelu_backward) ( namespace at::native { +<<<<<<< HEAD static constexpr double SELU_ALPHA = 1.6732632423543772848170429916717; static constexpr double SELU_SCALE = 1.0507009873554804934193349852946; +======= +static const double SELU_ALPHA = 1.6732632423543772848170429916717; +static const double SELU_SCALE = 1.0507009873554804934193349852946; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DEFINE_DISPATCH(elu_stub); DEFINE_DISPATCH(elu_backward_stub); @@ -670,8 +675,11 @@ Tensor rrelu_with_noise_backward( } Tensor rrelu(const Tensor & self, const Scalar& lower, const Scalar& upper, bool training, std::optional generator) { +<<<<<<< HEAD TORCH_CHECK(std::isfinite(lower.to()), "rrelu: lower bound must be finite, got ", lower.to()); TORCH_CHECK(std::isfinite(upper.to()), "rrelu: upper bound must be finite, got ", upper.to()); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(lower.to() <= upper.to(), "Lower bound should be less than or equal to the upper bound") auto noise = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); return at::rrelu_with_noise(self, noise, lower, upper, training, std::move(generator)); diff --git a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp index 5821cd561cdf1..1e0b36a735f2a 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp @@ -24,7 +24,11 @@ namespace at::native { namespace { template +<<<<<<< 
HEAD void adaptive_avg_pool3d_out_frame( +======= +static void adaptive_avg_pool3d_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t* input_p, scalar_t* output_p, int64_t sizeD, @@ -176,7 +180,11 @@ void adaptive_avg_pool3d_out_cpu_template( } template +<<<<<<< HEAD void adaptive_avg_pool3d_backward_out_frame( +======= +static void adaptive_avg_pool3d_backward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* gradInput_p, const scalar_t* gradOutput_p, int64_t sizeD, diff --git a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp index ef4bab3ec1de0..436ac6d66b2ca 100644 --- a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp @@ -93,7 +93,11 @@ namespace { // 5d tensor B x D x T x H x W template +<<<<<<< HEAD void adaptive_max_pool3d_single_out_frame( +======= +static void adaptive_max_pool3d_single_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t *input_p, scalar_t *output_p, int64_t *ind_p, @@ -170,7 +174,11 @@ void adaptive_max_pool3d_single_out_frame( } template +<<<<<<< HEAD void adaptive_max_pool3d_out_frame( +======= +static void adaptive_max_pool3d_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t *input_data, scalar_t *output_data, int64_t *indices_data, @@ -202,7 +210,11 @@ void adaptive_max_pool3d_out_frame( } template +<<<<<<< HEAD void adaptive_max_pool3d_backward_single_out_frame( +======= +static void adaptive_max_pool3d_backward_single_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t *gradInput_p, const scalar_t *gradOutput_p, const int64_t *ind_p, @@ -241,7 +253,11 @@ void adaptive_max_pool3d_backward_single_out_frame( } template +<<<<<<< HEAD void adaptive_max_pool3d_backward_out_frame( +======= +static void adaptive_max_pool3d_backward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t *gradInput_data, const scalar_t *gradOutput_data, const int64_t *indices_data, diff --git a/aten/src/ATen/native/AveragePool3d.cpp b/aten/src/ATen/native/AveragePool3d.cpp index 365cfa311512a..8a74920373edf 100644 --- a/aten/src/ATen/native/AveragePool3d.cpp +++ b/aten/src/ATen/native/AveragePool3d.cpp @@ -153,7 +153,11 @@ namespace at::native { namespace { template +<<<<<<< HEAD void avg_pool3d_out_frame( +======= +static void avg_pool3d_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t *input_p, scalar_t *output_p, int64_t nslices, @@ -333,7 +337,11 @@ TORCH_IMPL_FUNC(avg_pool3d_out_cpu) ( namespace { template +<<<<<<< HEAD void avg_pool3d_backward_out_frame( +======= +static void avg_pool3d_backward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t *gradInput_p, const scalar_t *gradOutput_p, int64_t nslices, diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp 
index 6669357cda456..941ecc32399f9 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -2060,7 +2060,11 @@ std::tuple linalg_lu_factor(const Tensor& A, bool pivot) { } // TODO Deprecate this function in favour of linalg_lu_factor_ex +<<<<<<< HEAD std::tuple _lu_with_info(const Tensor& self, bool compute_pivots, bool /*unused*/) { +======= +std::tuple _lu_with_info(const Tensor& self, bool compute_pivots, bool) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_ONCE( "torch.lu is deprecated in favor of torch.linalg.lu_factor / torch.linalg.lu_factor_ex and will be ", "removed in a future PyTorch release.\n", @@ -2453,7 +2457,11 @@ TORCH_IMPL_FUNC(linalg_qr_out)(const Tensor& A, // geqrf requires m x n workspace input that is modified in-place // We try to use Q. If it doesn't fit, we try to use R +<<<<<<< HEAD // If m > n and compute_q==false, it won't fit into Q or R, so we need to create an auxiliary tensor +======= + // If m > n and compute_q==false, it won't fit into Q or R, so we neet to create an auxiliary tensor +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor QR; if (compute_q && Q.size(-1) == n) { QR = Q; @@ -4095,7 +4103,11 @@ Tensor linalg_vander_symint( const auto n = N.value_or(shape.back()); TORCH_CHECK(n > 1, "N must be greater than 1."); +<<<<<<< HEAD // Append cumprod of the other 0...n-1 powers +======= + // Append cumprod of the oher 0...n-1 powers +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shape.push_back(n - 1); auto result = at::cumprod(x_.unsqueeze(-1).expand_symint(shape), -1); // The row of ones diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp index df64aa42e602f..acc4cfe4044d8 100644 --- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp @@ -143,13 +143,21 @@ Tensor& cholesky_inverse_kernel_impl(Tensor& result, Tensor& infos, bool upper) For more info see https://github.com/pytorch/pytorch/issues/145801#issuecomment-2631781776 */ template +<<<<<<< HEAD inline +======= +static inline +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::enable_if_t, int> lapack_work_to_int(const T val) { const auto next_after = std::nextafter(val, std::numeric_limits::infinity()); return std::max(1, std::ceil(next_after)); } template +<<<<<<< HEAD inline +======= +static inline +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::enable_if_t::value, int> lapack_work_to_int(const T val) { return lapack_work_to_int(val.real()); } @@ -343,7 +351,11 @@ void linalg_eigh_kernel(const Tensor& eigenvalues, const Tensor& eigenvectors, c For further details, please see the LAPACK documentation for GEQRF. 
*/ template +<<<<<<< HEAD void apply_geqrf(const Tensor& input, const Tensor& tau) { +======= +static void apply_geqrf(const Tensor& input, const Tensor& tau) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if !AT_BUILD_WITH_LAPACK() TORCH_CHECK( false, @@ -1039,7 +1051,11 @@ void lu_solve_kernel(const Tensor& LU, const Tensor& pivots, const Tensor& B, Tr } template +<<<<<<< HEAD void apply_svd(const Tensor& A, +======= +static void apply_svd(const Tensor& A, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const bool full_matrices, const bool compute_uv, const Tensor& U, diff --git a/aten/src/ATen/native/Blas.cpp b/aten/src/ATen/native/Blas.cpp index 6b7496f49732e..4fdd239f18294 100644 --- a/aten/src/ATen/native/Blas.cpp +++ b/aten/src/ATen/native/Blas.cpp @@ -9,7 +9,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if !defined(__s390x__) && !defined(__powerpc__) #include #endif @@ -58,7 +61,11 @@ scalar_t dot_impl(int64_t n, const scalar_t *x, int64_t incx, const scalar_t *y, template scalar_t vdot_impl(int64_t n, const scalar_t *x, int64_t incx, const scalar_t *y, int64_t incy); +<<<<<<< HEAD static constexpr bool lda_cond(int64_t m, int64_t n, int64_t lda) { +======= +static constexpr inline bool lda_cond(int64_t m, int64_t n, int64_t lda) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return n == 1 || lda >= std::max(1L, m); } @@ -333,6 +340,7 @@ _scaled_mm_cpu(const Tensor& mat_a, const Tensor& mat_b, return _scaled_mm_out_cpu(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out); } +<<<<<<< HEAD // TODO(vasiliy, future PR): figure out why we need to declare this function, when // other functions that live in ATen/native/*.cpp without declarations // or headers work just fine. 
@@ -352,4 +360,6 @@ std::optional out_dtype) { return out; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::native diff --git a/aten/src/ATen/native/BlasKernel.cpp b/aten/src/ATen/native/BlasKernel.cpp index b476ca3cff8f1..69e36ebb62265 100644 --- a/aten/src/ATen/native/BlasKernel.cpp +++ b/aten/src/ATen/native/BlasKernel.cpp @@ -286,7 +286,11 @@ template void scal_fast_path(int *n, scalar_t *a, scalar_t *x, int *in #if AT_BUILD_WITH_BLAS() template <> bool scal_use_fast_path(int64_t n, int64_t incx) { +<<<<<<< HEAD auto constexpr intmax = std::numeric_limits::max(); +======= + auto intmax = std::numeric_limits::max(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return n <= intmax && incx <= intmax; } @@ -315,7 +319,11 @@ bool gemv_use_fast_path( int64_t incx, [[maybe_unused]] float beta, int64_t incy) { +<<<<<<< HEAD auto constexpr intmax = std::numeric_limits::max(); +======= + auto intmax = std::numeric_limits::max(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (m <= intmax) && (n <= intmax) && (lda <= intmax) && (incx > 0) && (incx <= intmax) && (incy > 0) && (incy <= intmax); } @@ -375,7 +383,11 @@ static void bf16_gemv_trans( const at::BFloat16 beta, at::BFloat16* y, const int incy) { +<<<<<<< HEAD bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy); +======= + return bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> diff --git a/aten/src/ATen/native/BucketizationUtils.h b/aten/src/ATen/native/BucketizationUtils.h index bd19f9c987f14..e7a038e37ffb6 100644 --- a/aten/src/ATen/native/BucketizationUtils.h +++ b/aten/src/ATen/native/BucketizationUtils.h @@ -70,7 +70,11 @@ inline void searchsorted_maybe_trim_input_tensors( const Tensor& raw_boundaries) { Tensor trimmed_sorter; Tensor raw_sorter; +<<<<<<< HEAD searchsorted_maybe_trim_input_tensors( +======= + return searchsorted_maybe_trim_input_tensors( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) trimmed_input, trimmed_boundaries, trimmed_sorter, diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp index c17a70ea308ab..b3852d40f6fb0 100644 --- a/aten/src/ATen/native/CPUBlas.cpp +++ b/aten/src/ATen/native/CPUBlas.cpp @@ -51,7 +51,11 @@ extern "C" void zaxpy_(int *n, void *a, const void *x, int *incx, void *y, int * // brgemm_pack_B is changed to transform and the setting of brgemm beta is changed to set_add_C #if (IDEEP_VERSION_MAJOR == 3 && IDEEP_VERSION_MINOR == 5) #define ONEDNN_UKERNEL_1 +<<<<<<< HEAD #elif ((IDEEP_VERSION_MAJOR == 3 && IDEEP_VERSION_MINOR >= 6) || (IDEEP_VERSION_MAJOR > 3)) +======= +#elif (IDEEP_VERSION_MAJOR >= 3 && IDEEP_VERSION_MINOR >= 6) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define ONEDNN_UKERNEL_2 #endif #if ((defined(ONEDNN_UKERNEL_1) || defined(ONEDNN_UKERNEL_2)) && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))) @@ -202,7 +206,11 @@ void gemm( float *c, int64_t ldc) { internal::normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, 
&ldc); #if AT_MKLDNN_ENABLED() +<<<<<<< HEAD if (mkldnn_reduced_f32_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) { +======= + if (mkldnn_bf32_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return; } #endif @@ -358,6 +366,7 @@ void gemm( int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; char transa_ = to_blas(transa), transb_ = to_blas(transb); float alpha_ = alpha, beta_ = beta; +<<<<<<< HEAD int c_size = n_ * m_; // C matrix in OpenBLAS sbgemm are of type "float" so we have to convert, copy and copy back. std::vector float_v(c_size, 0.0f); @@ -366,17 +375,28 @@ void gemm( float_v[j * m_ + i] = c10::convert(c[j * ldc_ + i]); } } +======= + int c_size = n_ * ldc_; + // C matrix in OpenBLAS sbgemm are of type "float" so we have to convert, copy and copy back. + std::vector float_v(c, c + c_size); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sbgemm_(&transa_, &transb_, &m_, &n_, &k_, &alpha_, a, &lda_, b, &ldb_, &beta_, +<<<<<<< HEAD float_v.data(), &m_); for (const auto j : c10::irange(n)) { for (const auto i : c10::irange(m)) { c[j * ldc_ + i] = c10::convert(float_v[j * m_ + i]); } +======= + float_v.data(), &ldc_); + for (auto cv: float_v) { + *(c++) = c10::convert(cv); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return; } @@ -457,9 +477,30 @@ void gemm( return; } #endif +<<<<<<< HEAD gemm_no_downcast_stub( at::kCPU, at::kBFloat16, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +======= + // for the fallback path, first compute gemm with beta = 0, + // and then add c in full precision. + int64_t c_size = n * m; + std::vector float_c(c_size, 0.f); + gemm_no_downcast_stub( + at::kCPU, at::kBFloat16, + transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m); + for (const auto j : c10::irange(n)) { + for (const auto i : c10::irange(m)) { + auto offset = j * ldc + i; + // beta == 0 won't propagate NaN from C + if (beta == 0.f) { + c[offset] = float_c[j * m + i]; + } else { + c[offset] = beta * c[offset] + float_c[j * m + i]; + } + } + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void gemm( @@ -478,9 +519,30 @@ void gemm( return; } #endif +<<<<<<< HEAD gemm_no_downcast_stub( at::kCPU, at::kHalf, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +======= + // for the fallback path, first compute gemm with beta = 0, + // and then add c in full precision. 
+ int64_t c_size = n * m; + std::vector float16_c(c_size, 0.f); + gemm_stub( + at::kCPU, at::kHalf, + transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float16_c.data(), m); + for (const auto j : c10::irange(n)) { + for (const auto i : c10::irange(m)) { + auto offset = j * ldc + i; + // beta == 0 won't propagate NaN from C + if (beta == 0.f) { + c[offset] = c10::convert(float16_c[j * m + i]); + } else { + c[offset] = beta * c[offset] + c10::convert(float16_c[j * m + i]); + } + } + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void gemm( @@ -991,7 +1053,11 @@ std::size_t UnsafeUkernelKeyHasher::operator()(const PackKey& key) cons template struct KernelCache { using kstore_t = std::unordered_map, UnsafeUkernelKeyHasher>; +<<<<<<< HEAD static std::shared_ptr&& fetch_or_create( +======= + static inline std::shared_ptr&& fetch_or_create( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const key_t& key, const std::function()>& callback) { auto&& search = get_store().find(key); @@ -1003,7 +1069,11 @@ struct KernelCache { } } +<<<<<<< HEAD static kstore_t& get_store() { +======= + static inline kstore_t& get_store() { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static thread_local kstore_t cache_kernels; return cache_kernels; } @@ -1067,7 +1137,11 @@ struct GemmHelper { struct Brgemm : public KernelCache { // Fetch/create GemmHelper object and execute brgemm with batch size = 1 template +<<<<<<< HEAD static void call( +======= + static inline void call( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t M, int64_t N, int64_t K, @@ -1118,12 +1192,20 @@ struct Brgemm : public KernelCache { .execute(A, B, (*value).A_B_offsets, C, (*value).scratchpad.data()); } +<<<<<<< HEAD static std::shared_ptr& get_current() { +======= + static inline std::shared_ptr& get_current() { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static thread_local std::shared_ptr current; return current; } +<<<<<<< HEAD static bool device_check(ScalarType dtype) { +======= + static inline bool device_check(ScalarType dtype) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (!at::globalContext().userEnabledMkldnn()) { return false; } @@ -1153,7 +1235,11 @@ using pack_t = dnnl::ukernel::brgemm_pack_B; using pack_t = dnnl::ukernel::transform; #endif struct Pack : public KernelCache { +<<<<<<< HEAD static void call( +======= + static inline void call( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t K, int64_t N, int64_t ld_in, @@ -1182,7 +1268,11 @@ struct Pack : public KernelCache { } } +<<<<<<< HEAD static bool could_pack(ScalarType dtype) { +======= + static inline bool could_pack(ScalarType dtype) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (!at::globalContext().userEnabledMkldnn()) { return false; } diff --git a/aten/src/ATen/native/CPUBlas.h b/aten/src/ATen/native/CPUBlas.h index 8b75f12ebaf21..8512af333fb8e 100644 --- a/aten/src/ATen/native/CPUBlas.h +++ 
b/aten/src/ATen/native/CPUBlas.h @@ -206,6 +206,7 @@ void copy(int64_t n, const c10::complex *x, int64_t incx, c10::complex *x, int64_t incx, c10::complex int32 #define CPUBLAS_BRGEMM_I8I8I32 // signed char * signed char -> int32 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_API void brgemm( int64_t M, int64_t N, diff --git a/aten/src/ATen/native/ChanelShuffle.cpp b/aten/src/ATen/native/ChanelShuffle.cpp index d043014b3820e..b2ec8db631da7 100644 --- a/aten/src/ATen/native/ChanelShuffle.cpp +++ b/aten/src/ATen/native/ChanelShuffle.cpp @@ -81,7 +81,11 @@ Tensor math_channel_shuffle(const Tensor& self, int64_t groups) { // TODO: contiguous can be made to preserve the memory format // of the input. However since the above reshape clobbers h and w // it may not be safe to do that, since channels_last contiguous +<<<<<<< HEAD // may think oc and the last dim correspond to h,w? +======= + // may think oc and and the last dim correspond to h,w? +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // It is not clear, however from initial looking around it feels that // this may not be correct. // In this case channels last will likely require custom implementation diff --git a/aten/src/ATen/native/Col2Im.cpp b/aten/src/ATen/native/Col2Im.cpp index f0270a02b2677..f496f15512a3c 100644 --- a/aten/src/ATen/native/Col2Im.cpp +++ b/aten/src/ATen/native/Col2Im.cpp @@ -71,7 +71,11 @@ namespace at::native { namespace { +<<<<<<< HEAD void col2im_out_cpu_template( +======= +static void col2im_out_cpu_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& output, const Tensor& input_, IntArrayRef output_size, diff --git a/aten/src/ATen/native/ComparisonUtils.cpp b/aten/src/ATen/native/ComparisonUtils.cpp index 13bef0a00b9c9..3c7134cd18617 100644 --- a/aten/src/ATen/native/ComparisonUtils.cpp +++ b/aten/src/ATen/native/ComparisonUtils.cpp @@ -24,6 +24,7 @@ static void _assert_match(const O& original, const C& compared, const std::strin } } +<<<<<<< HEAD template<> void _assert_match>( const c10::Device& original, @@ -47,6 +48,8 @@ void _assert_match>( } } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void _assert_tensor_metadata_meta_symint(at::Tensor const& tensor, at::OptionalSymIntArrayRef sizes, at::OptionalSymIntArrayRef strides, std::optional dtype, std::optional device, std::optional layout) { _assert_match(tensor.sym_sizes(), sizes, "sizes"); _assert_match(tensor.sym_strides(), strides, "strides"); diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 892144ac663a6..c09bea137751c 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -465,11 +465,16 @@ inline bool mps_conv_use_channels_last(const at::Tensor& input, const at::Tensor return false; } +<<<<<<< HEAD auto is_channel_last = [](const at::Tensor& t) { auto fmt = t.suggest_memory_format(); return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d; }; return is_channel_last(input) || is_channel_last(weight); +======= + auto fmt = input.suggest_memory_format(); + return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // namespace at::native diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 610f454be21fa..db1e7f66fa865 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -3,7 +3,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -14,7 +17,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -32,6 +38,13 @@ #include #endif +<<<<<<< HEAD +======= +#ifdef USE_MPS +#include +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include #include @@ -297,6 +310,7 @@ struct ConvParams { bool allow_tf32{}; bool is_strided() const { +<<<<<<< HEAD return std::any_of( stride.cbegin(), stride.cend(), [](const T& s) { return s != 1; }); } @@ -341,6 +355,69 @@ struct ConvParams { bool is_stride_nonpos() const { return std::any_of( stride.cbegin(), stride.cend(), [](const T& s) { return s <= 0; }); +======= + bool is_strided = false; + for (const auto& s : stride) { + is_strided |= (s != 1); + } + return is_strided; + } + + bool is_dilated() const { + bool is_dilated = false; + for (const auto& d : dilation) { + is_dilated |= (d != 1); + } + return is_dilated; + } + + bool is_padded() const { + bool is_padded = false; + for (auto p : padding) { + is_padded |= (p != 0); + } + return is_padded; + } + + bool is_output_padding_neg() const { + bool is_non_neg = false; + for (const auto& p : output_padding) { + is_non_neg |= (p < 0); + } + return is_non_neg; + } + + bool is_output_padding_big() const { + bool is_big = false; + for (auto i: c10::irange(output_padding.size())) { + is_big |= (output_padding[i] >= stride[i]); + } + return is_big; + } + + bool is_padding_neg() const { + bool is_non_neg = false; + for (const auto& p : padding) { + is_non_neg |= (p < 0); + } + return is_non_neg; + } + + bool is_dilation_neg() const { + bool is_non_neg = false; + for (const auto& p : dilation) { + is_non_neg |= (p < 0); + } + return is_non_neg; + } + + bool is_stride_nonpos() const { + bool is_nonpos = false; + for (const auto& s : stride) { + is_nonpos |= (s <= 0); + } + return is_nonpos; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void view1d_as_2d() { @@ -406,6 +483,7 @@ struct ConvParams { // cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest // that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how) #if !defined(C10_MOBILE) +<<<<<<< HEAD if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) { return false; } @@ -423,6 +501,13 @@ struct ConvParams { } } if (needs_64bit_indexing_no_split(input, weight)) { +======= + if (!detail::getCUDAHooks().compiledWithCuDNN()) { + return false; + } + if (needs_64bit_indexing_no_split(input, weight)) { + static long cudnn_version = detail::getCUDAHooks().versionCuDNN(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if 
(!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" " if the V8 API is not enabled or before cuDNN version 9.3+." @@ -430,6 +515,12 @@ struct ConvParams { return false; } } +<<<<<<< HEAD +======= + if (!input.is_cuda() || !cudnn_enabled) { + return false; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) { if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) { return false; @@ -448,6 +539,7 @@ struct ConvParams { // Use cudnn for FP16 depthwise convolutions bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const { +<<<<<<< HEAD if (!cudnn_enabled || !detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda()) { return false; } @@ -461,6 +553,18 @@ struct ConvParams { } } +======= + if (!detail::getCUDAHooks().compiledWithCuDNN()) { + return false; + } + if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) { + // always use cudnn_depthwise for channels_last format + return true; + } + // native kernel doesn't support 64-bit non-splittable case + if (cudnn_enabled && needs_64bit_indexing_no_split(input, weight)) { + static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" " if the V8 API is not enabled or before cuDNN version 9.3+." 
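Both sides of the ConvParams conflict above gate the cuDNN path on a lazily cached version query plus a warn-once message. As an illustrative aside (not part of the diff), the minimal C++ sketch below shows that pattern in isolation; queryCudnnVersion() is a hypothetical stub standing in for detail::getCUDAHooks().versionCuDNN(), and a static flag with fprintf stands in for TORCH_WARN_ONCE.

#include <cstdint>
#include <cstdio>

// Hypothetical stub for the real cuDNN version lookup.
static int64_t queryCudnnVersion() { return 90100; }

// Gate a feature on the cached version, warning only once, mirroring the
// function-local-static + warn-once pattern in the hunk above.
static bool largeConvSupported() {
  static const int64_t cudnn_version = queryCudnnVersion();  // queried once per process
  if (cudnn_version < 90300) {
    static bool warned = false;
    if (!warned) {
      std::fprintf(stderr,
          "cuDNN cannot be used for large non-batch-splittable convolutions "
          "if the V8 API is not enabled or before cuDNN version 9.3+.\n");
      warned = true;
    }
    return false;
  }
  return true;
}

int main() {
  std::printf("supported: %d\n", largeConvSupported());
  return 0;
}

The function-local static means the version is queried at most once per process, which is why both branches of the conflict hoist the lookup out of the per-call path.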
@@ -470,10 +574,13 @@ struct ConvParams { return true; } } +<<<<<<< HEAD if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) { // always use cudnn_depthwise for channels_last format return true; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) { bool kernel_cond = (use_cudnn(input, weight) && input.scalar_type() == kHalf && // only for FP16 @@ -658,7 +765,10 @@ static void check_shape_forward(const at::Tensor& input, TORCH_CHECK(!params.is_output_padding_neg(), "negative output_padding is not supported"); TORCH_CHECK(!params.is_stride_nonpos(), "non-positive stride is not supported"); TORCH_CHECK(!params.is_dilation_neg(), "dilation should be greater than zero"); +<<<<<<< HEAD TORCH_CHECK(groups > 0, "expected groups to be greater than 0, but got groups=", groups); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(weight_dim == k, "Expected ", weight_dim, "-dimensional input for ", weight_dim, @@ -689,10 +799,13 @@ static void check_shape_forward(const at::Tensor& input, ", but got bias of size ", at::symint::sizes(bias), " instead"); for (const auto i : c10::irange(2, k)) { +<<<<<<< HEAD // T could be int64_t or SymInt, Specialized numeric_limts in c10/core/SymInt.h TORCH_CHECK(padding[i-2] <= (std::numeric_limits::max() - padding[i-2]), "Given padding=", padding[i-2], " at dimension ", i-2, " , expected padding to be at most ", (std::numeric_limits::max() / 2)); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) input_shape.push_back(at::symint::size(input, i) + 2 * padding[i-2]); // log new kernel size considering dilation kernel_shape.push_back(dilation[i-2] * (weight_sizes[i]-1) + 1); @@ -707,7 +820,11 @@ static void check_shape_forward(const at::Tensor& input, // If kernel size is incorrect std::ostringstream input_ss; std::ostringstream kernel_ss; +<<<<<<< HEAD std::string separator; +======= + std::string separator = ""; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (int i = 0, len = input_shape.size(); i < len; ++i) { input_ss << separator << input_shape[i]; @@ -719,11 +836,14 @@ static void check_shape_forward(const at::Tensor& input, "Kernel size: (", kernel_ss.str(), "). 
Kernel size can't be greater than actual input size"); } } else { // transposed +<<<<<<< HEAD for (const auto i : c10::irange(2, k)) { TORCH_CHECK(padding[i-2] <= (std::numeric_limits::max() - padding[i-2]), "Given padding=", padding[i-2], " at dimension ", i-2, " , expected padding to be at most ", (std::numeric_limits::max() / 2)); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(at::symint::size(input, 1) == weight_sizes[0], "Given transposed=", transposed, ", weight of size ", weight_sizes, ", expected input", at::symint::sizes(input), " to have ", weight_sizes[0], @@ -1029,7 +1149,11 @@ static Tensor convolution_same( if (symmetric_padding) { // All backends handle symmetric padding natively +<<<<<<< HEAD SymDimVector output_padding(dim); +======= + SymDimVector output_padding(static_cast(dim)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::convolution_symint(input, weight, bias, stride, padding_l, dilation, false, output_padding, groups); } @@ -1049,7 +1173,11 @@ static Tensor convolution_same( } } auto padded_input = at::constant_pad_nd_symint(input, pad_nd, 0); +<<<<<<< HEAD SymDimVector output_padding(dim); +======= + SymDimVector output_padding(static_cast(dim)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::convolution_symint(padded_input, weight, bias, stride, padding_l, dilation, false, output_padding, groups); } @@ -1184,7 +1312,11 @@ at::Tensor convolution( bool deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms(); return at::_convolution(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, +<<<<<<< HEAD ctx.benchmarkCuDNN(), deterministic, ctx.userEnabledCuDNN(), ctx.allowTF32CuDNN(at::Float32Op::CONV)); +======= + ctx.benchmarkCuDNN(), deterministic, ctx.userEnabledCuDNN(), ctx.allowTF32CuDNN()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } at::Tensor convolution_overrideable( @@ -1329,7 +1461,11 @@ ConvBackend select_conv_backend( params.benchmark = ctx.benchmarkCuDNN(); params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms(); params.cudnn_enabled = ctx.userEnabledCuDNN(); +<<<<<<< HEAD params.allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV); +======= + params.allow_tf32 = ctx.allowTF32CuDNN(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto input = input_r; auto weight = weight_r; @@ -1451,8 +1587,17 @@ static inline at::MemoryFormat determine_backend_memory_format( } break; case ConvBackend::Mps: +<<<<<<< HEAD case ConvBackend::MpsTranspose: if (mps_conv_use_channels_last(input, weight)) { +======= + if (mps_conv_use_channels_last(input, weight)) { +#ifdef USE_MPS + if (!mps::is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_15_0_PLUS)) { + break; + } +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) backend_memory_format = (k == 5) ? 
MemoryFormat::ChannelsLast3d : MemoryFormat::ChannelsLast; } break; @@ -1709,7 +1854,11 @@ at::Tensor _convolution( c10::MaybeOwned bias_r_maybe_owned = at::borrow_from_optional_tensor(bias_r_opt); const Tensor& bias_r = *bias_r_maybe_owned; +<<<<<<< HEAD return at::_convolution(input_r, weight_r, bias_r, stride_, padding_, dilation_, transposed_, output_padding_, groups_, benchmark, deterministic, cudnn_enabled, at::globalContext().allowTF32CuDNN(at::Float32Op::CONV)); +======= + return at::_convolution(input_r, weight_r, bias_r, stride_, padding_, dilation_, transposed_, output_padding_, groups_, benchmark, deterministic, cudnn_enabled, at::globalContext().allowTF32CuDNN()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } std::tuple convolution_backward_overrideable( @@ -2007,7 +2156,11 @@ std::tuple convolution_backward( params.benchmark = ctx.benchmarkCuDNN(); params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms(); params.cudnn_enabled = ctx.userEnabledCuDNN(); +<<<<<<< HEAD params.allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV); +======= + params.allow_tf32 = ctx.allowTF32CuDNN(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Validate inputs. check_shape_backward(input, weight.sizes(), params); diff --git a/aten/src/ATen/native/ConvolutionMM2d.cpp b/aten/src/ATen/native/ConvolutionMM2d.cpp index 538a893d54ea0..16ab0dc2d0ea5 100644 --- a/aten/src/ATen/native/ConvolutionMM2d.cpp +++ b/aten/src/ATen/native/ConvolutionMM2d.cpp @@ -25,7 +25,11 @@ namespace at::native { namespace { +<<<<<<< HEAD Tensor compute_columns2d( +======= +static Tensor compute_columns2d( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, IntArrayRef padding, IntArrayRef stride, @@ -93,7 +97,11 @@ Tensor compute_columns2d( return columns.contiguous(); } +<<<<<<< HEAD inline void slow_conv2d_shape_check( +======= +static inline void slow_conv2d_shape_check( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, const Tensor& grad_output, const Tensor& weight, @@ -205,7 +213,11 @@ inline void slow_conv2d_shape_check( } } +<<<<<<< HEAD inline Tensor view_weight_2d(const Tensor& weight_, +======= +static inline Tensor view_weight_2d(const Tensor& weight_, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::MemoryFormat memory_format = at::MemoryFormat::Contiguous) { Tensor weight = weight_.contiguous(memory_format); if (weight.dim() == 4) { @@ -220,7 +232,11 @@ inline Tensor view_weight_2d(const Tensor& weight_, } template +<<<<<<< HEAD void slow_conv2d_update_output_frame( +======= +static void slow_conv2d_update_output_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorAccessor input, TensorAccessor output, TensorAccessor weight, @@ -480,7 +496,11 @@ void slow_conv2d_backward_weight_frame( } } +<<<<<<< HEAD void slow_conv2d_backward_weight_out_cpu_template( +======= +static void slow_conv2d_backward_weight_out_cpu_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& grad_weight, const 
Tensor& input, const Tensor& grad_output_, diff --git a/aten/src/ATen/native/ConvolutionMM3d.cpp b/aten/src/ATen/native/ConvolutionMM3d.cpp index 894bf29456f78..e1c10ea7073a1 100644 --- a/aten/src/ATen/native/ConvolutionMM3d.cpp +++ b/aten/src/ATen/native/ConvolutionMM3d.cpp @@ -9,7 +9,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include @@ -28,7 +31,11 @@ namespace at::native { namespace { +<<<<<<< HEAD Tensor compute_columns3d( +======= +static Tensor compute_columns3d( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input_, IntArrayRef stride, IntArrayRef padding, @@ -108,7 +115,11 @@ Tensor compute_columns3d( return columns; } +<<<<<<< HEAD inline void slow_conv3d_shape_check( +======= +static inline void slow_conv3d_shape_check( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, const Tensor& grad_output, const Tensor& weight, @@ -175,6 +186,7 @@ inline void slow_conv3d_shape_check( const int64_t input_height = input.size(dim_height); const int64_t input_width = input.size(dim_width); +<<<<<<< HEAD constexpr int64_t MAX_SAFE_PAD = (1LL << 61); TORCH_CHECK_VALUE( @@ -192,6 +204,8 @@ inline void slow_conv3d_shape_check( "Padding depth too large: pad_depth=", pad_depth); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t exact_input_depth = input_depth + 2 * pad_depth; const int64_t exact_input_height = input_height + 2 * pad_height; const int64_t exact_input_width = input_width + 2 * pad_width; @@ -239,6 +253,7 @@ inline void slow_conv3d_shape_check( output_width, "). 
Output size is too small"); +<<<<<<< HEAD uint64_t kernel_product; TORCH_CHECK( !c10::mul_overflows(kernel_height, kernel_width, &kernel_product), @@ -247,6 +262,8 @@ inline void slow_conv3d_shape_check( ", kernel_width=", kernel_width); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (weight.defined()) { int64_t n_input_plane = weight.size(1); if (weight.dim() == 2) { @@ -273,7 +290,11 @@ inline void slow_conv3d_shape_check( } } +<<<<<<< HEAD Tensor view_weight_2d(const Tensor& weight_) { +======= +static Tensor view_weight_2d(const Tensor& weight_) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor weight = weight_.contiguous(); if (weight.dim() == 5) { const int64_t s1 = weight.size(0); @@ -286,7 +307,11 @@ Tensor view_weight_2d(const Tensor& weight_) { } template +<<<<<<< HEAD void slow_conv3d_update_output_frame( +======= +static void slow_conv3d_update_output_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorAccessor input, TensorAccessor output, TensorAccessor weight, @@ -515,7 +540,11 @@ void slow_conv3d_backward_weight_frame( grad_weight.data(), ldc, grad_weight.stride(0) * n); } +<<<<<<< HEAD void slow_conv3d_backward_parameters_out_cpu_template( +======= +static void slow_conv3d_backward_parameters_out_cpu_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& grad_weight, const Tensor& input, const Tensor& grad_output, diff --git a/aten/src/ATen/native/ConvolutionTBC.cpp b/aten/src/ATen/native/ConvolutionTBC.cpp index 8786257a8bd62..2e6a8551c8a70 100644 --- a/aten/src/ATen/native/ConvolutionTBC.cpp +++ b/aten/src/ATen/native/ConvolutionTBC.cpp @@ -52,7 +52,12 @@ Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, in for (const auto k : c10::irange(kw)) { int iShift = std::max(0, static_cast(k - real_pad)); int oShift = std::max(0, static_cast(real_pad - k)); +<<<<<<< HEAD long t = std::min(ilen + real_pad - k, olen) - oShift; +======= + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + int t = std::min(ilen + real_pad - k, olen) - oShift; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Note: gemm assumes column-major matrices // input is l*m (row-major) // weight is m*r (row-major) diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index 0b3ffda30577f..e6a9408f76156 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -1,5 +1,9 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -35,10 +39,15 @@ #endif #ifdef USE_FBGEMM +<<<<<<< HEAD C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi") #include #include C10_DIAGNOSTIC_POP() +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif namespace { diff --git a/aten/src/ATen/native/DilatedMaxPool2d.cpp b/aten/src/ATen/native/DilatedMaxPool2d.cpp index 641e9f14dd711..ad084a913ef86 100644 --- 
a/aten/src/ATen/native/DilatedMaxPool2d.cpp +++ b/aten/src/ATen/native/DilatedMaxPool2d.cpp @@ -54,7 +54,11 @@ bool ceil_mode) { TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4), "non-empty 3D or 4D (batch mode) tensor expected for input"); } else { +<<<<<<< HEAD TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); +======= + TORCH_CHECK(false, "Unsupport memory format. Supports only ChannelsLast, Contiguous"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } /* sizes */ @@ -130,7 +134,11 @@ const Tensor& indices) { TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4), "non-empty 3D or 4D (batch mode) tensor expected for input"); } else { +<<<<<<< HEAD TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); +======= + TORCH_CHECK(false, "Unsupport memory format. Supports only ChannelsLast, Contiguous"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } /* sizes */ diff --git a/aten/src/ATen/native/DilatedMaxPool3d.cpp b/aten/src/ATen/native/DilatedMaxPool3d.cpp index 23d77cb210720..afa493eda70ad 100644 --- a/aten/src/ATen/native/DilatedMaxPool3d.cpp +++ b/aten/src/ATen/native/DilatedMaxPool3d.cpp @@ -63,7 +63,11 @@ void max_pool3d_with_indices_out_cpu_template( TORCH_CHECK((input.ndimension() == 4 || input.ndimension() == 5), "non-empty 4D or 5D (batch mode) tensor expected for input"); } else { +<<<<<<< HEAD TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast3d, Contiguous"); +======= + TORCH_CHECK(false, "Unsupport memory format. Supports only ChannelsLast3d, Contiguous"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } const int64_t nslices = input.size(-4); @@ -158,7 +162,11 @@ Tensor& max_pool3d_with_indices_backward_out_cpu_template( TORCH_CHECK((input.ndimension() == 4 || input.ndimension() == 5), "non-empty 4D or 5D (batch mode) tensor expected for input"); } else { +<<<<<<< HEAD TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast3d, Contiguous"); +======= + TORCH_CHECK(false, "Unsupport memory format. Supports only ChannelsLast3d, Contiguous"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } const int64_t nslices = input.size(-4); diff --git a/aten/src/ATen/native/DistributionTemplates.h b/aten/src/ATen/native/DistributionTemplates.h index 21a15b80c9c84..b5675ecb79d1a 100644 --- a/aten/src/ATen/native/DistributionTemplates.h +++ b/aten/src/ATen/native/DistributionTemplates.h @@ -28,13 +28,21 @@ namespace at::native::templates { // ==================================================== Random ======================================================== // The purpose of `update_from` and `update_to` is to find the closest valid int64_t number that can be used as actual `from`. +<<<<<<< HEAD // The current implementation of `random_` uses uint64_t arithmetic and casts the result to the target dtype(scalar_t). +======= +// The current implementation of `random_` uses uint64_t arithmetics and casts the result to the target dtype(scalar_t). 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // This casting can result in generating numbers that happen to be greater or equal to `to` value. For instance: // // auto actual = torch::empty({3, 3}, torch::half); // actual.random_(0, 65504); // +<<<<<<< HEAD // If random's uint64_t arithmetic produces 65503 as a random value after casting to torch::half it becomes 65504 +======= +// If random's uint64_t arithmetics produces 65503 as a random value after casting to torch::half it becomes 65504 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // and violates the requirement that random value must be less than `to`. To resolve this issue `update_from` and `update_to` // moves `from` to the right and `to` to the left to the next closest value that won't go outside [from, to) after casting to // the target dtype. For `to` = 65504 it moves left for (1 << (log2(to) - 11 + 1)) = 32 and becomes 65472, which is previous diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index 5f34ed9d24c17..03f8acb547cd1 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -424,6 +424,7 @@ Tensor _dirichlet_grad_cpu(const Tensor& x, const Tensor& alpha, const Tensor& t */ Tensor _s_binomial_cpu(const Tensor& count, const Tensor& prob, std::optional gen) { +<<<<<<< HEAD TORCH_CHECK_VALUE( at::isFloatingType(count.scalar_type()), "binomial only supports floating-point dtypes for count, got: ", @@ -432,6 +433,8 @@ Tensor _s_binomial_cpu(const Tensor& count, const Tensor& prob, std::optional>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor ret = at::zeros(count.sizes(), count.options()); auto iter = TensorIteratorConfig() .add_output(ret) diff --git a/aten/src/ATen/native/Distributions.h b/aten/src/ATen/native/Distributions.h index 755fe00b1f1c5..46ea0b67227d7 100644 --- a/aten/src/ATen/native/Distributions.h +++ b/aten/src/ATen/native/Distributions.h @@ -1,6 +1,9 @@ #pragma once +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -128,7 +131,11 @@ C10_DEVICE scalar_t sample_gamma(scalar_t alpha, BaseSampler C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) { +<<<<<<< HEAD constexpr static scalar_t kTailValues[] = { +======= + const static scalar_t kTailValues[] = { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 0.0810614667953272, 0.0413406959554092, 0.0276779256849983, @@ -140,7 +147,11 @@ C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) { 0.00925546218271273, 0.00833056343336287 }; +<<<<<<< HEAD if (k < std::size(kTailValues)) { +======= + if (k <= 9) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return kTailValues[static_cast(k)]; } scalar_t kp1sq = (k + 1) * (k + 1); diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index e1076d0400f79..0f5c60e65dce0 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -14,10 +14,15 @@ #include #ifdef USE_FBGEMM +<<<<<<< HEAD 
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi") #include #include C10_DIAGNOSTIC_POP() +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else #include #endif @@ -108,7 +113,11 @@ bool is_fast_path(const Tensor& src, const std::optional& scale, Tensor& // index_add (using add_indices as the index), without creating an intermediary // tensor to hold the selected embeddings template +<<<<<<< HEAD std::enable_if_t, void> +======= +static std::enable_if_t, void> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) index_select_add( const Tensor& select_indices, const Tensor& add_indices, @@ -494,7 +503,11 @@ index_select_add(const Tensor &select_indices, // mul (scaling by per_sample_weights) // index_add (using add_indices as the index) template +<<<<<<< HEAD std::enable_if_t, void> +======= +static std::enable_if_t, void> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) index_select_scale_add( const Tensor& select_indices, const Tensor& add_indices, diff --git a/aten/src/ATen/native/EmbeddingBag.h b/aten/src/ATen/native/EmbeddingBag.h index a344422204844..b86a3fae6d089 100644 --- a/aten/src/ATen/native/EmbeddingBag.h +++ b/aten/src/ATen/native/EmbeddingBag.h @@ -1,4 +1,7 @@ +<<<<<<< HEAD #pragma once +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include diff --git a/aten/src/ATen/native/Fill.cpp b/aten/src/ATen/native/Fill.cpp index 8e04a7490e879..a3b0323211c47 100644 --- a/aten/src/ATen/native/Fill.cpp +++ b/aten/src/ATen/native/Fill.cpp @@ -97,24 +97,42 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) { int64_t nDims = self.dim(); TORCH_CHECK(nDims >= 2, "dimensions must larger than 1"); +<<<<<<< HEAD auto height = self.sym_size(0); auto width = self.sym_size(1); if (nDims > 2) { for (const auto i : c10::irange(1, nDims)) { if (self.sym_size(i) != height) { +======= + int64_t height = self.size(0); + int64_t width = self.size(1); + + if (nDims > 2) { + int64_t dim1 = height; + for (const auto i : c10::irange(1, nDims)) { + if (self.size(i) != dim1) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "all dimensions of input must be of equal length"); } } } +<<<<<<< HEAD auto storage_offset = self.sym_storage_offset(); auto size = std::min(height, width); +======= + int64_t storage_offset = self.storage_offset(); + std::vector sizes; + std::vector strides; + int64_t size = std::min(height, width); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t stride = 0; for (const auto i : c10::irange(nDims)) { stride += self.stride(i); } +<<<<<<< HEAD std::vector strides{stride}; std::vector sizes{size}; @@ -129,6 +147,24 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) { auto offset = self.stride(0) * (width + 1); auto wrap_diag = self.as_strided_symint(wrap_sizes, strides, storage_offset + offset); +======= + strides.push_back(stride); + sizes.push_back(size); + + auto main_diag = self.as_strided(sizes, strides, storage_offset); + main_diag.fill_(fill_value); + + if (wrap && nDims == 2 && height 
> width + 1) { + std::vector wrap_sizes; + + int64_t step = width + 1; + int64_t wrap_size = ((self.numel() + step - 1) / step) - size; + wrap_sizes.push_back(wrap_size); + + int64_t offset = self.stride(0) * (width + 1); + + auto wrap_diag = self.as_strided(wrap_sizes, strides, storage_offset + offset); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wrap_diag.fill_(fill_value); } diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index cb437fb45ce21..4065c93f87bae 100644 --- a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -260,7 +260,10 @@ namespace at::native { check_foreach_api_restrictions(input, tensors1, tensors2); \ \ std::vector result; \ +<<<<<<< HEAD result.reserve(input.size()); \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(input.size())) { \ result.emplace_back(input[i].OP(tensors1[i], tensors2[i], scalar)); \ } \ @@ -289,7 +292,10 @@ namespace at::native { check_foreach_api_restrictions(input, tensors1, tensors2, scalars); \ \ std::vector result; \ +<<<<<<< HEAD result.reserve(input.size()); \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(input.size())) { \ result.emplace_back(input[i].OP(tensors1[i], tensors2[i], scalars[i])); \ } \ @@ -419,7 +425,10 @@ std::vector foreach_tensor_ternary_lerp_slow( TensorList tensors3) { check_foreach_api_restrictions(tensors1, tensors2, tensors3); std::vector result; +<<<<<<< HEAD result.reserve(tensors1.size()); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(tensors1.size())) { result.emplace_back(tensors1[i].lerp(tensors2[i], tensors3[i])); } @@ -442,7 +451,10 @@ std::vector foreach_tensor_lerp_scalarlist_kernel_slow( at::ArrayRef scalars) { check_foreach_api_restrictions(tensors1, tensors2, scalars); std::vector result; +<<<<<<< HEAD result.reserve(tensors1.size()); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(tensors1.size())) { result.emplace_back(tensors1[i].lerp(tensors2[i], scalars[i])); } @@ -473,7 +485,10 @@ std::vector foreach_tensor_norm_slow( std::optional dtype) { check_foreach_api_restrictions(tensors); std::vector result; +<<<<<<< HEAD result.reserve(tensors.size()); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto& t : tensors) { result.emplace_back(at::linalg_vector_norm(t, ord, {}, false, dtype)); } @@ -483,7 +498,10 @@ std::vector foreach_tensor_norm_slow( std::vector foreach_tensor_max_slow(TensorList tensors) { check_foreach_api_restrictions(tensors); std::vector result; +<<<<<<< HEAD result.reserve(tensors.size()); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto& t : tensors) { result.emplace_back(at::max(t)); } diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h index f0dce20a6eff4..a2bfe650a1219 100644 --- 
a/aten/src/ATen/native/ForeachUtils.h +++ b/aten/src/ATen/native/ForeachUtils.h @@ -22,7 +22,11 @@ namespace { // Check if tensor list has either a boolean tensor or a integer tensor inline bool has_integral_tensor(TensorList tensors, const bool includeBool) { return std::any_of( +<<<<<<< HEAD tensors.begin(), tensors.end(), [includeBool](const auto& t) { +======= + tensors.begin(), tensors.end(), [&includeBool](const auto& t) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::isIntegralType(t.scalar_type(), includeBool); }); } @@ -53,8 +57,13 @@ inline void check_foreach_api_restrictions( inline void check_foreach_api_restrictions( TensorList tensors1, TensorList tensors2) { +<<<<<<< HEAD check_foreach_api_restrictions(tensors1); check_foreach_api_restrictions(tensors2); +======= + TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor."); + TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor."); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( tensors1.size() == tensors2.size(), "Tensor lists must have the same number of tensors, got ", @@ -67,8 +76,26 @@ inline void check_foreach_api_restrictions( TensorList tensors1, TensorList tensors2, TensorList tensors3) { +<<<<<<< HEAD check_foreach_api_restrictions(tensors1, tensors2); check_foreach_api_restrictions(tensors1, tensors3); +======= + TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor."); + TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor."); + TORCH_CHECK(!tensors3.empty(), "Tensor list must have at least one tensor."); + TORCH_CHECK( + tensors1.size() == tensors2.size(), + "Tensor lists must have the same number of tensors, got ", + tensors1.size(), + " and ", + tensors2.size()); + TORCH_CHECK( + tensors1.size() == tensors3.size(), + "Tensor lists must have the same number of tensors, got ", + tensors1.size(), + " and ", + tensors3.size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline void check_foreach_api_restrictions( @@ -77,7 +104,16 @@ inline void check_foreach_api_restrictions( TensorList tensors3, ArrayRef scalars) { check_foreach_api_restrictions(tensors1, tensors2, tensors3); +<<<<<<< HEAD check_foreach_api_restrictions(tensors1, scalars); +======= + TORCH_CHECK( + tensors1.size() == scalars.size(), + "Tensor list must have same number of elements as scalar list, got ", + tensors1.size(), + " and ", + scalars.size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline void check_foreach_api_restrictions( @@ -85,7 +121,16 @@ inline void check_foreach_api_restrictions( TensorList tensors2, ArrayRef scalars) { check_foreach_api_restrictions(tensors1, tensors2); +<<<<<<< HEAD check_foreach_api_restrictions(tensors1, scalars); +======= + TORCH_CHECK( + tensors1.size() == scalars.size(), + "Tensor list must have same number of elements as scalar list, got ", + tensors1.size(), + " and ", + scalars.size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Helper function called in check_fast_path_restrictions to check whether all @@ -103,6 +148,7 @@ inline bool _check_tensors_share_device_and_dtype( 
tensor.is_non_overlapping_and_dense(); }; +<<<<<<< HEAD return std::all_of( tensorLists.cbegin(), tensorLists.cend(), @@ -110,6 +156,17 @@ inline bool _check_tensors_share_device_and_dtype( return std::all_of( tensorList.cbegin(), tensorList.cend(), is_tensor_okay); }); +======= + for (const auto& tensorList : tensorLists) { + for (const auto& tensor : tensorList) { + if (!is_tensor_okay(tensor)) { + return false; + } + } + } + + return true; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Helper function called in check_fast_path_restrictions to check if @@ -155,9 +212,17 @@ inline bool _check_tensors_do_type_promotion_with_scalars( bool does_op_promote_integer_inputs_to_float = false) { for (const auto i : c10::irange(tensorList.size())) { // For division, integer inputs will result in float. +<<<<<<< HEAD if (does_op_promote_integer_inputs_to_float && at::isIntegralType(tensorList[i].scalar_type(), /*includeBool*/ true)) { return false; +======= + if (does_op_promote_integer_inputs_to_float) { + if (at::isIntegralType( + tensorList[i].scalar_type(), /*includeBool*/ true)) { + return false; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (!scalarList.empty()) { const auto& scalar = @@ -334,6 +399,7 @@ inline FlatMap _group_tensors_by_first_tensors_device_and_dtype( } }), "Tensors of the same index must be on the same device and the same dtype except `step` tensors that can be CPU and float32/64 notwithstanding"); +<<<<<<< HEAD grouped_tensors_with_indices.try_emplace( key, TensorsAndIndicesT{ @@ -362,6 +428,38 @@ inline FlatMap _group_tensors_by_first_tensors_device_and_dtype( return indices; } }()}); +======= + if (!grouped_tensors_with_indices.count(key)) { + grouped_tensors_with_indices.insert( + {key, + TensorsAndIndicesT{ + [&]() -> nested_optional_tensorvec_t { + nested_optional_tensorvec_t nested_tensorvec; + nested_tensorvec.reserve(num_lists); + for (const auto& i : c10::irange(num_lists)) { + std::vector> tensors; + if (!nested_tensorlist[i].empty()) { + // NB: num_tensors is the max possible length for any of + // the inner lists of tensor references. Reserving the max + // trades memory for perf. This should not have significant + // impact. 
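The surrounding hunk reworks how _group_tensors_by_first_tensors_device_and_dtype builds its per-(device, dtype) buckets: the HEAD side uses try_emplace while the incoming side checks count() before insert(), and both reserve the inner vectors up front as the NB comment explains. A stand-alone sketch of that bucketing pattern, using plain std::string keys and int payloads as stand-ins for the device/dtype key and tensor references, might look like this:

#include <cstddef>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Group values by key. try_emplace constructs the bucket only when the key is
// new (the equivalent of the count()/insert() dance on the incoming side), and
// reserve() trades a little memory for fewer reallocations.
std::unordered_map<std::string, std::vector<int>> group_by_key(
    const std::vector<std::pair<std::string, int>>& items,
    std::size_t max_per_key) {
  std::unordered_map<std::string, std::vector<int>> grouped;
  for (const auto& [key, value] : items) {
    auto [it, inserted] = grouped.try_emplace(key);
    if (inserted) {
      it->second.reserve(max_per_key);  // reserve the max possible length once
    }
    it->second.push_back(value);
  }
  return grouped;
}

Because try_emplace only constructs the bucket on first sight of a key, the reserve() call runs exactly once per key.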
+ tensors.reserve(num_tensors); + } + nested_tensorvec.emplace_back(tensors); + } + return nested_tensorvec; + }(), + [&]() -> IndicesT { + if (!with_indices) { + return {}; + } else { + IndicesT indices; + indices.reserve(num_tensors); + return indices; + } + }()}}); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto& list_index : c10::irange(num_lists)) { if (!nested_tensorlist[list_index].empty()) { grouped_tensors_with_indices[key].first[list_index].emplace_back( diff --git a/aten/src/ATen/native/FractionalMaxPool2d.cpp b/aten/src/ATen/native/FractionalMaxPool2d.cpp index 664a612d0b137..228db8c60d65d 100644 --- a/aten/src/ATen/native/FractionalMaxPool2d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool2d.cpp @@ -130,7 +130,11 @@ namespace native { namespace { template +<<<<<<< HEAD void fractional_max_pool2d_out_single_batch_frame( +======= +static void fractional_max_pool2d_out_single_batch_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t* input, scalar_t* output, int64_t* indices, @@ -188,7 +192,11 @@ void fractional_max_pool2d_out_single_batch_frame( } template +<<<<<<< HEAD void fractional_max_pool2d_out_frame( +======= +static void fractional_max_pool2d_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t* input, scalar_t* output, int64_t* indices, @@ -220,7 +228,11 @@ void fractional_max_pool2d_out_frame( } template +<<<<<<< HEAD void fractional_max_pool2d_backward_out_single_batch_frame( +======= +static void fractional_max_pool2d_backward_out_single_batch_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* gradInput, const scalar_t* gradOutput, const int64_t* indices, @@ -247,7 +259,11 @@ void fractional_max_pool2d_backward_out_single_batch_frame( } template +<<<<<<< HEAD void fractional_max_pool2d_backward_out_frame( +======= +static void fractional_max_pool2d_backward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* gradInput, const scalar_t* gradOutput, const int64_t* indices, diff --git a/aten/src/ATen/native/FractionalMaxPool3d.cpp b/aten/src/ATen/native/FractionalMaxPool3d.cpp index 5ed3fdeab7651..538e99c8a77de 100644 --- a/aten/src/ATen/native/FractionalMaxPool3d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool3d.cpp @@ -67,6 +67,7 @@ TORCH_PRECOMPUTE_META_FUNC(fractional_max_pool3d)( int64_t inputH = input_.size(heightDim); int64_t inputW = input_.size(widthDim); +<<<<<<< HEAD TORCH_CHECK((poolSizeT <= inputT) && (outputT + poolSizeT - 1 < inputT), "fractional_max_pool3d_out(): pool time ", poolSizeT, " too large relative to input time ", inputT); @@ -74,6 +75,15 @@ TORCH_PRECOMPUTE_META_FUNC(fractional_max_pool3d)( "fractional_max_pool3d_out(): pool width ", poolSizeW, " too large relative to input width ", inputW); TORCH_CHECK((poolSizeH <= inputH) && (outputH + poolSizeH - 1 < inputH), +======= + TORCH_CHECK(outputT + poolSizeT - 1 < inputT, + "fractional_max_pool3d_out(): pool time ", poolSizeT, + " too large relative to input time ", inputT); + TORCH_CHECK(outputW + poolSizeW - 1 < inputW, + "fractional_max_pool3d_out(): pool width ", poolSizeW, + " too large relative to input width ", 
inputW); + TORCH_CHECK(outputH + poolSizeH - 1 < inputH, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "fractional_max_pool3d_out(): pool height ", poolSizeH, " too large relative to input height ", inputH); @@ -99,7 +109,11 @@ namespace at::native { namespace { template +<<<<<<< HEAD void fractional_max_pool3d_out_single_batch_frame( +======= +static void fractional_max_pool3d_out_single_batch_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t* input, scalar_t* output, int64_t* indices, @@ -169,7 +183,11 @@ void fractional_max_pool3d_out_single_batch_frame( } template +<<<<<<< HEAD void fractional_max_pool3d_out_frame( +======= +static void fractional_max_pool3d_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t* input, scalar_t* output, int64_t* indices, @@ -257,7 +275,11 @@ TORCH_IMPL_FUNC(fractional_max_pool3d_out_cpu)( namespace { template +<<<<<<< HEAD void fractional_max_pool3d_backward_out_single_batch_frame( +======= +static void fractional_max_pool3d_backward_out_single_batch_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* gradInput, const scalar_t* gradOutput, const int64_t* indices, @@ -287,7 +309,11 @@ void fractional_max_pool3d_backward_out_single_batch_frame( } template +<<<<<<< HEAD void fractional_max_pool3d_backward_out_frame( +======= +static void fractional_max_pool3d_backward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* gradInput, const scalar_t* gradOutput, const int64_t* indices, diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp index 0ca8ec2a3a887..37454e5a0f260 100644 --- a/aten/src/ATen/native/GridSampler.cpp +++ b/aten/src/ATen/native/GridSampler.cpp @@ -86,7 +86,11 @@ namespace { for (const auto d : c10::irange(out_D)) { for (const auto h : c10::irange(out_H)) { for (const auto w : c10::irange(out_W)) { +<<<<<<< HEAD // get the corresponding input x, y, z coordinates from grid +======= + // get the corresponding input x, y, z co-ordinates from grid +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; scalar_t ix = *grid_ptr_NDHW; scalar_t iy = grid_ptr_NDHW[grid_sCoor]; @@ -285,7 +289,11 @@ namespace { for (const auto d : c10::irange(out_D)) { for (const auto h : c10::irange(out_H)) { for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NDHW += gGrid_sW /* grad_grid is contiguous */ ) { +<<<<<<< HEAD // get the corresponding input x, y, z coordinates from grid +======= + // get the corresponding input x, y, z co-ordinates from grid +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; scalar_t ix = *grid_ptr_NDHW; scalar_t iy = grid_ptr_NDHW[grid_sCoor]; @@ -496,7 +504,11 @@ static Tensor _grid_sampler_2d_cpu_quantized( uint8_t* inp_ptr_N = inp_ptr + n * inp_sN; for (const auto h : c10::irange(out_H)) { for (const auto w : c10::irange(out_W)) { 
+<<<<<<< HEAD // get the corresponding input x, y, z coordinates from grid +======= + // get the corresponding input x, y, z co-ordinates from grid +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) float* grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; float x = *grid_ptr_NHW; float y = grid_ptr_NHW[grid_sCoor]; @@ -599,7 +611,11 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid, const scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; for (const auto h : c10::irange(out_H)) { for (const auto w : c10::irange(out_W)) { +<<<<<<< HEAD // get the corresponding input x, y, z coordinates from grid +======= + // get the corresponding input x, y, z co-ordinates from grid +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; scalar_t x = *grid_ptr_NHW; scalar_t y = grid_ptr_NHW[grid_sCoor]; @@ -771,7 +787,11 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output, scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN; for (const auto h : c10::irange(out_H)) { for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) { +<<<<<<< HEAD // get the corresponding input x, y coordinates from grid +======= + // get the corresponding input x, y co-ordinates from grid +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; scalar_t x = *grid_ptr_NHW; scalar_t y = grid_ptr_NHW[grid_sCoor]; diff --git a/aten/src/ATen/native/GridSamplerUtils.h b/aten/src/ATen/native/GridSamplerUtils.h index 3388af7b8a0a7..56fd4d728c193 100644 --- a/aten/src/ATen/native/GridSamplerUtils.h +++ b/aten/src/ATen/native/GridSamplerUtils.h @@ -93,12 +93,15 @@ inline bool cond_cudnn_grid_sampler( const TensorBase& input, const TensorBase& grid ) { +<<<<<<< HEAD auto st = input.scalar_type(); if (!(st == kDouble || st == kFloat || st == kHalf)) return false; st = grid.scalar_type(); if (!(st == kDouble || st == kFloat || st == kHalf)) return false; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ( at::native::cudnn_is_acceptable(input) && at::native::cudnn_is_acceptable(grid) && diff --git a/aten/src/ATen/native/Histogram.cpp b/aten/src/ATen/native/Histogram.cpp index 5919997cf5fe5..f30376a5a01eb 100644 --- a/aten/src/ATen/native/Histogram.cpp +++ b/aten/src/ATen/native/Histogram.cpp @@ -23,7 +23,10 @@ #include #endif +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -203,6 +206,7 @@ select_outer_bin_edges(const Tensor& input, std::optional> return std::make_pair(leftmost_edges, rightmost_edges); } +<<<<<<< HEAD /* Bin edges correction based on the precision representation. * To maintain the backward compatibility we take max(std::nextafter<>, +1) @@ -243,6 +247,8 @@ void bins_edges_correction(const ScalarType& t, double &leftmost_edge, double &r #undef UPDATE_WITH_LIMIT } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /* histc's version of the logic for outermost bin edges. 
*/ std::pair histc_select_outer_bin_edges(const Tensor& input, @@ -257,7 +263,12 @@ std::pair histc_select_outer_bin_edges(const Tensor& input, } if (leftmost_edge == rightmost_edge) { +<<<<<<< HEAD bins_edges_correction(input.dtype().toScalarType(), leftmost_edge, rightmost_edge); +======= + leftmost_edge -= 1; + rightmost_edge += 1; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } TORCH_CHECK(!(std::isinf(leftmost_edge) || std::isinf(rightmost_edge) || diff --git a/aten/src/ATen/native/Im2Col.cpp b/aten/src/ATen/native/Im2Col.cpp index acdcb2b27bda2..869ab96ea7327 100644 --- a/aten/src/ATen/native/Im2Col.cpp +++ b/aten/src/ATen/native/Im2Col.cpp @@ -19,7 +19,11 @@ namespace at::native { namespace { +<<<<<<< HEAD void im2col_out_cpu_template( +======= +static void im2col_out_cpu_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& output, const Tensor& input_, IntArrayRef kernel_size, diff --git a/aten/src/ATen/native/IndexingUtils.cpp b/aten/src/ATen/native/IndexingUtils.cpp index 16d7c8670699a..4b623b5e36517 100644 --- a/aten/src/ATen/native/IndexingUtils.cpp +++ b/aten/src/ATen/native/IndexingUtils.cpp @@ -16,7 +16,12 @@ bool canUse32BitIndexMath(const TensorBase& t, int64_t max_elem) { auto linearId = elements - 1; // NOTE: Assumes all strides are positive, which is true for now +<<<<<<< HEAD for (auto i = t.dim() - 1; i >= 0; --i) { +======= + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + for (int i = t.dim() - 1; i >= 0; --i) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto curDimIndex = linearId % t.sym_size(i); auto curDimOffset = curDimIndex * t.sym_stride(i); offset += curDimOffset; diff --git a/aten/src/ATen/native/IndexingUtils.h b/aten/src/ATen/native/IndexingUtils.h index 948a6b8320a4e..612f479ddeaa5 100644 --- a/aten/src/ATen/native/IndexingUtils.h +++ b/aten/src/ATen/native/IndexingUtils.h @@ -5,6 +5,7 @@ #include #include +<<<<<<< HEAD #ifndef AT_PER_OPERATOR_HEADERS #include #else @@ -12,6 +13,8 @@ #include #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::native { [[noreturn]] @@ -22,8 +25,12 @@ static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, [[maybe_unused]] static std::vector expandTensors( const Tensor& self, +<<<<<<< HEAD IOptTensorListRef indices, bool ensure_same_device = false) { +======= + IOptTensorListRef indices) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // If indices come in as ByteTensor or BoolTensor (masks), expand them into // the equivalent indexing by LongTensors std::vector result; @@ -46,6 +53,7 @@ static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, } } // Replace with nonzeros +<<<<<<< HEAD at::Tensor nonzero; if (ensure_same_device && index.device() != self.device()) { bool non_blocking = index.is_cpu() && self.device().is_cuda(); @@ -59,6 +67,12 @@ static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, } } else if (ensure_same_device && index.device() != self.device()) { result.emplace_back(index.to(self.device())); +======= + auto nonzero = index.nonzero(); + for (const auto 
j : c10::irange(index.dim())) { + result.emplace_back(nonzero.select(1, j)); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { result.emplace_back(index); } diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index a744da3bcad2e..3ad3024e59735 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -93,7 +93,11 @@ Tensor linear(const Tensor& input, const Tensor& weight, const std::optionaldefined() && !input.is_xla()) { // Also hit the fused path for contiguous 3D input, if not using xla // backend. Reshaping/flattening has some performance implications on xla. +<<<<<<< HEAD bool is_contiguous = input.is_contiguous_or_false(); +======= + bool is_contiguous = definitely_contiguous(input.sym_sizes(), input.sym_strides(), input.sym_numel()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (is_contiguous && input_dim == 3) { return _flatten_nd_linear(input, weight, *bias); } else if (is_contiguous && input.layout() == c10::kStrided && weight.layout() == c10::kStrided && bias->dim() == 1) { @@ -154,8 +158,13 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra Tensor left = left_; Tensor right = right_; for (const auto i : c10::irange(dim)) { +<<<<<<< HEAD auto sl = TORCH_GUARD_OR_TRUE(left.sym_size(i).sym_ne(1)); auto sr = TORCH_GUARD_OR_TRUE(right.sym_size(i).sym_ne(1)); +======= + auto sl = TORCH_GUARD_SIZE_OBLIVIOUS(left.sym_size(i).sym_ne(1)); + auto sr = TORCH_GUARD_SIZE_OBLIVIOUS(right.sym_size(i).sym_ne(1)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (sum_dims[i]) { // first dimensions that will be summed over after multiplication if (sl && sr) { // dimensions nontrivially in both left and right must be of the same size TORCH_SYM_CHECK(left.sym_size(i).sym_eq(right.sym_size(i)), "non-broadcast dimensions must match"); @@ -185,6 +194,7 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra // right: "lro, summed, ro" permuted with rpermutation and the three flattened // then the permuted output is a view of bmm(left, right) // finally, opermutation reverts the permutation to the original order of dimensions +<<<<<<< HEAD // By default the output is "lro, lo, 1-for-summed-dims, ro" with original shape dimensions. 
// However, if all dimensions from the right operand appear before those from the left // operand in the final output, we can swap the operands so that bmm directly produces @@ -196,6 +206,8 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra std::swap(lo, ro); std::swap(lo_size, ro_size); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto out_num_dim = lro.size() + lo.size() + sum_dims_.size() + ro.size(); std::vector out_size; out_size.reserve(out_num_dim); @@ -488,7 +500,11 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr // Iterate over each dimension covered by ellipsis const auto ndim = operands[i].ndimension() - (static_cast(op_labels[i].size()) - 1); for (auto j = ell_num_dim - ndim; j < ell_num_dim; ++j) { +<<<<<<< HEAD if (TORCH_GUARD_OR_TRUE(op.sym_size(dim).sym_ne(1))) { +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(op.sym_size(dim).sym_ne(1))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Update ellipsis size TORCH_SYM_CHECK( ell_sizes[j].sym_eq(1).sym_or(ell_sizes[j].sym_eq(op.sym_size(dim))), @@ -507,7 +523,11 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr permutation[ell_index + j] = dim++; } } else if (permutation[label_perm_index[s]] == -1) { +<<<<<<< HEAD if (TORCH_GUARD_OR_TRUE(op.sym_size(dim).sym_ne(1))) { +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(op.sym_size(dim).sym_ne(1))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Update subscript TORCH_SYM_CHECK( label_size[s].sym_eq(1).sym_or(label_size[s].sym_eq(op.sym_size(dim))), @@ -585,6 +605,7 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr SmallVector a_dims_to_sum; SmallVector b_dims_to_sum; for (auto dim = out_num_dim; dim < perm_index; ++dim) { +<<<<<<< HEAD auto sa = TORCH_GUARD_OR_TRUE(a.sym_size(dim).sym_ne(1)); auto sb = TORCH_GUARD_OR_TRUE(b.sym_size(dim).sym_ne(1)); @@ -592,15 +613,26 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr // if both a and b are equal, or we can't tell that its a broadcast for sure, // we assume non-broadcast. 
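// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the bookkeeping the einsum hunk
// above performs on SymInts, replayed with plain int64_t sizes so the branch
// structure is easier to follow. The guard macros (TORCH_GUARD_OR_TRUE /
// TORCH_GUARD_SIZE_OBLIVIOUS) are replaced here by a plain `size != 1` test,
// which is only equivalent when every size is a known, backed integer; all
// names below are invented for the sketch.
#include <cstdint>
#include <vector>

struct ContractionPlan {
  std::vector<int64_t> sum_dims;       // contracted inside the single bmm
  std::vector<int64_t> a_dims_to_sum;  // summed out of `a` before the bmm
  std::vector<int64_t> b_dims_to_sum;  // summed out of `b` before the bmm
};

inline ContractionPlan plan_contraction(const std::vector<int64_t>& a_sizes,
                                        const std::vector<int64_t>& b_sizes,
                                        std::vector<int64_t> dim_counts,
                                        int64_t out_num_dim) {
  ContractionPlan plan;
  const auto perm_index = static_cast<int64_t>(a_sizes.size());
  for (int64_t dim = out_num_dim; dim < perm_index; ++dim) {
    const bool sa = a_sizes[dim] != 1;  // dim is non-broadcast in a
    const bool sb = b_sizes[dim] != 1;  // dim is non-broadcast in b
    if (sa && sb) {
      // Both operands carry the dimension: once only these two uses remain,
      // it can be reduced as part of the batched matmul itself.
      if (--dim_counts[dim] == 1) {
        plan.sum_dims.push_back(dim);
        dim_counts[dim] = 0;
      }
    } else if (dim_counts[dim] == 1) {
      // Only one operand carries it and nothing downstream needs it, so it
      // is summed out of that operand before the matmul.
      if (sa) {
        plan.a_dims_to_sum.push_back(dim);
        dim_counts[dim] = 0;
      } else if (sb) {
        plan.b_dims_to_sum.push_back(dim);
        dim_counts[dim] = 0;
      }
    }
  }
  return plan;
}
// ---------------------------------------------------------------------------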
TORCH_SYM_CHECK(a.sym_size(dim).sym_eq(b.sym_size(dim)), "non-broadcast dimensions must match"); +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(a.sym_size(dim).sym_ne(1)) + && TORCH_GUARD_SIZE_OBLIVIOUS(b.sym_size(dim).sym_ne(1))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (--dim_counts[dim] == 1) { sum_dims.push_back(dim); dim_counts[dim] = 0; } } else if (dim_counts[dim] == 1) { +<<<<<<< HEAD if (sa) { a_dims_to_sum.push_back(dim); dim_counts[dim] = 0; } else if (sb) { +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(a.sym_size(dim).sym_ne(1))) { + a_dims_to_sum.push_back(dim); + dim_counts[dim] = 0; + } else if (TORCH_GUARD_SIZE_OBLIVIOUS(b.sym_size(dim).sym_ne(1))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) b_dims_to_sum.push_back(dim); dim_counts[dim] = 0; } diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index c07c7a5ac6e07..8bb1f117cdad0 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1360,8 +1360,11 @@ Tensor outer(const Tensor& self, const Tensor& vec2) { #endif +<<<<<<< HEAD #if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED() // Used by default on x86 platforms and on AArch64+ACL +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline int64_t get_mkldnn_matmul_min_dim() { static auto value = [&] { const int64_t default_min_dim = [&] { @@ -1395,7 +1398,12 @@ static inline bool apply_mkldnn_matmul_heur(int64_t m, int64_t k, int64_t n) { const int64_t min_size = get_mkldnn_matmul_min_size(); return at::globalContext().userEnabledMkldnn() && m > min_dim && k > min_dim && n > min_dim && m * k * n > min_size; } +<<<<<<< HEAD #endif +======= + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static void addmm_impl_cpu_( Tensor &result, const Tensor &self, Tensor m1, Tensor m2, const Scalar& beta, const Scalar& alpha) { TORCH_INTERNAL_ASSERT(self.dim() == 2 && m1.dim() == 2 && m2.dim() == 2); @@ -1771,8 +1779,12 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens return (strides[2] == 1 && (sizes[1] == 1 || strides[1] >= sizes[2])) || (strides[1] == 1 && (sizes[2] == 1 || strides[2] >= sizes[1])); }; +<<<<<<< HEAD #if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED() // Always apply mkldnn heuristic on x86 platform, but on ARM only if compiled with ACL +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]); if (apply_heur && use_mkldnn_matmul(batch1, batch2, self_or_result)) { try { @@ -1783,7 +1795,11 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens at::globalContext().setUserEnabledMkldnn(false); } } +<<<<<<< HEAD #endif +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (contraction_size * res_rows * res_cols < 400) { if (is_bmm_out) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, batch1.scalar_type(), "bmm", [&] { @@ -2801,7 +2817,10 @@ Tensor matrix_exp(const Tensor& a) { // TODO This should be deprecated in 
favor of linalg_matrix_exp_differential // in FunctionsManual.cpp Tensor matrix_exp_backward(const Tensor& self, const Tensor& grad) { +<<<<<<< HEAD squareCheckInputs(self, "matrix_exp_backward"); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) NoTF32Guard disable_tf32; return backward_analytic_function_of_a_matrix( self, grad, @@ -3620,7 +3639,11 @@ Tensor& _int_mm_out_cpu(const Tensor& self, const Tensor& mat2, Tensor& result) try { mkldnn_matmul_i8i8i32(self, mat2, result); dispatched = true; +<<<<<<< HEAD } catch ([[maybe_unused]] const std::exception& e) { +======= + } catch (const std::exception& e) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN(func_name, " failed, switching to BLAS gemm: ", e.what()); } } diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h index 257863573d3a8..f8db79e80d4cc 100644 --- a/aten/src/ATen/native/LinearAlgebraUtils.h +++ b/aten/src/ATen/native/LinearAlgebraUtils.h @@ -148,7 +148,11 @@ inline void checkInputsSolver(const Tensor& A, inline bool is_row_or_column_contiguous(const Tensor& t) { // This could be made more general, similar to how it's checked in matmul, which would allow to +<<<<<<< HEAD // elide the copy with strides such as (6, 12, 1, 3) or (3, 1, 9), but this is quite tricky. +======= + // ellide the copy with strides such as (6, 12, 1, 3) or (3, 1, 9), but this is quite tricky. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We choose to be conservative for simplicity return t.is_contiguous() || t.transpose(-2, -1).is_contiguous(); } diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp index 40d79d97c0cdf..c379fbd6f9e24 100644 --- a/aten/src/ATen/native/Loss.cpp +++ b/aten/src/ATen/native/Loss.cpp @@ -61,7 +61,11 @@ constexpr float EPSILON = 1e-12; namespace { +<<<<<<< HEAD inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64_t reduction) { +======= + static inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64_t reduction) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (reduction == at::Reduction::Mean) { return unreduced.mean(); } else if (reduction == at::Reduction::Sum) { @@ -127,9 +131,12 @@ TORCH_IMPL_FUNC(smooth_l1_loss_out) TORCH_IMPL_FUNC(mse_loss_out) (const Tensor& input, const Tensor& target, int64_t reduction, const Tensor& result) { +<<<<<<< HEAD TORCH_CHECK(input.device() == target.device(), "Expected all tensors to be on the same device, but found at least two devices, ", input.device(), " and ", target.device(), "!"); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (reduction != Reduction::None) { Tensor loss; auto iter = TensorIterator::borrowing_binary_op(loss, input, target); diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp index 2e2bc5542b51b..65d4826cf2793 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -44,7 +44,11 @@ namespace { // this ad-hoc converts from targets (l in [1]) to augmented targets (l' in [1]) note that no bound-checking is done template +<<<<<<< HEAD inline int64_t get_target_prime(target_t* 
target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { +======= +static inline int64_t get_target_prime(target_t* target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (idx % 2 == 0) { return BLANK; } else { diff --git a/aten/src/ATen/native/LossMultiLabelMargin.cpp b/aten/src/ATen/native/LossMultiLabelMargin.cpp index b524d277cd0aa..fc725045f4b99 100644 --- a/aten/src/ATen/native/LossMultiLabelMargin.cpp +++ b/aten/src/ATen/native/LossMultiLabelMargin.cpp @@ -58,7 +58,11 @@ inline scalar_t multilabel_margin_loss_forward_inner_sum_cpu( } template +<<<<<<< HEAD void multilabel_margin_loss_forward_out_frame( +======= +static void multilabel_margin_loss_forward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input_contiguous, const Tensor& target_contiguous, Tensor& output, @@ -108,7 +112,11 @@ void multilabel_margin_loss_forward_out_frame( } } +<<<<<<< HEAD void multilabel_margin_loss_forward_out_cpu_template( +======= +static void multilabel_margin_loss_forward_out_cpu_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, const Tensor& target, Tensor& output, @@ -153,7 +161,11 @@ void multilabel_margin_loss_forward_out_cpu_template( } template +<<<<<<< HEAD void multilabel_margin_loss_backward_out_frame( +======= +static void multilabel_margin_loss_backward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& grad_input, const Tensor& grad_output, const Tensor& input_contiguous, @@ -222,7 +234,11 @@ void multilabel_margin_loss_backward_out_frame( } } +<<<<<<< HEAD void multilabel_margin_loss_backward_out_cpu_template( +======= +static void multilabel_margin_loss_backward_out_cpu_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& grad_input, const Tensor& grad_output, const Tensor& input, diff --git a/aten/src/ATen/native/LossMultiMargin.cpp b/aten/src/ATen/native/LossMultiMargin.cpp index f9dc074a6983b..099c353ab5968 100644 --- a/aten/src/ATen/native/LossMultiMargin.cpp +++ b/aten/src/ATen/native/LossMultiMargin.cpp @@ -57,7 +57,11 @@ inline int64_t target_index_checked( } template +<<<<<<< HEAD inline void multi_margin_loss_cpu_kernel( +======= +static inline void multi_margin_loss_cpu_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& output, const scalar_t* input_data, const int64_t* target_data, @@ -148,7 +152,11 @@ void multi_margin_loss_out_cpu_template( } template +<<<<<<< HEAD void multi_margin_loss_backward_cpu_kernel( +======= +static void multi_margin_loss_backward_cpu_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* grad_input_data, const Tensor& grad_output, const scalar_t* input_data, diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index 576f56986988b..52eda70660bb7 100644 --- a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -47,6 +47,7 @@ TORCH_META_FUNC(nll_loss_forward) TORCH_CHECK( 
target.dim() <= 1, "0D or 1D target tensor expected, multi-target not supported"); +<<<<<<< HEAD if (self.dim() == 1 && target.dim() == 1) { TORCH_CHECK_VALUE( target.size(0) == 1, @@ -55,6 +56,12 @@ TORCH_META_FUNC(nll_loss_forward) } TORCH_CHECK( self.dim() == 1 || (self.size(0) == target.size(0)), +======= + + auto no_batch_dim = self.dim() == 1 && target.dim() == 0; + TORCH_CHECK( + no_batch_dim || (self.size(0) == target.size(0)), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "size mismatch (got input: ", self.sizes(), ", target: ", @@ -159,7 +166,11 @@ inline scalar_t* optional_data(const Tensor& source) { } template +<<<<<<< HEAD void nll_loss_out_frame( +======= +static void nll_loss_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& output, const Tensor& total_weight, const Tensor& input, @@ -338,7 +349,11 @@ void nll_loss_forward_out_cpu_template( } template +<<<<<<< HEAD void nll_loss_backward_out_frame( +======= +static void nll_loss_backward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& grad_input, const Tensor& grad_output, const Tensor& input, diff --git a/aten/src/ATen/native/LossNLL2d.cpp b/aten/src/ATen/native/LossNLL2d.cpp index 7bea90cbd5274..e668eb62302a8 100644 --- a/aten/src/ATen/native/LossNLL2d.cpp +++ b/aten/src/ATen/native/LossNLL2d.cpp @@ -99,7 +99,11 @@ inline void check_gradout_shape_nll_loss2d( template +<<<<<<< HEAD void nll_loss2d_forward_out_frame( +======= +static void nll_loss2d_forward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& output, Tensor& total_weight, const Tensor& input, @@ -280,7 +284,11 @@ void nll_loss2d_forward_out_cpu_template( } template +<<<<<<< HEAD void nll_loss2d_backward_out_frame( +======= +static void nll_loss2d_backward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& grad_input, const Tensor& grad_output, const Tensor& input, diff --git a/aten/src/ATen/native/Math.h b/aten/src/ATen/native/Math.h index 4677542706f6b..49f983aeb9cae 100644 --- a/aten/src/ATen/native/Math.h +++ b/aten/src/ATen/native/Math.h @@ -581,7 +581,11 @@ scalar_t ratevl(scalar_t x, const scalar_t num[], int64_t M, template static scalar_t lanczos_sum_expg_scaled(scalar_t x) { // lanczos approximation +<<<<<<< HEAD static constexpr scalar_t lanczos_sum_expg_scaled_num[13] = { +======= + static const scalar_t lanczos_sum_expg_scaled_num[13] = { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 0.006061842346248906525783753964555936883222, 0.5098416655656676188125178644804694509993, 19.51992788247617482847860966235652136208, @@ -596,7 +600,11 @@ static scalar_t lanczos_sum_expg_scaled(scalar_t x) { 103794043.1163445451906271053616070238554, 56906521.91347156388090791033559122686859 }; +<<<<<<< HEAD static constexpr scalar_t lanczos_sum_expg_scaled_denom[13] = { +======= + static const scalar_t lanczos_sum_expg_scaled_denom[13] = { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 1., 66., 1925., @@ -712,7 +720,11 @@ static scalar_t 
_igamc_helper_series(scalar_t a, scalar_t x) { template static scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) { // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1] +<<<<<<< HEAD static constexpr scalar_t d[25][25] = +======= + static const scalar_t d[25][25] = +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {{-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2, 1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4, 3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6, @@ -1068,7 +1080,11 @@ inline scalar_t calc_igammac(scalar_t a, scalar_t x) { * result at the boundary * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for * Large Parameter (see DLMF 8.12.4 [igam1]) +<<<<<<< HEAD * - if x > 1.1 and x < a, using the subtraction from the regularized lower +======= + * - if x > 1.1 and x < a, using the substraction from the regularized lower +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * incomplete gamma * - otherwise, calculate the series from [igam2] eq (5) */ @@ -1148,7 +1164,11 @@ scalar_t calc_igamma(scalar_t a, scalar_t x) { * result at the boundary * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for * Large Parameter (see DLMF 8.12.3 [igam1]) +<<<<<<< HEAD * - if x > 1 and x > a, using the subtraction from the regularized upper +======= + * - if x > 1 and x > a, using the substraction from the regularized upper +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * incomplete gamma * - otherwise, calculate the series from [igam2] eq (4) */ @@ -1730,7 +1750,11 @@ inline C10_HOST_DEVICE T calc_ndtri(T y0) { with the usual checks for overflow etcetera. Performance-wise, it seems to be substantially faster than either +<<<<<<< HEAD the SLATEC DERFC function [or an erfcx function derived there from] +======= + the SLATEC DERFC function [or an erfcx function derived therefrom] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) or Cody's CALERF function (from netlib.org/specfun), while retaining near machine precision in accuracy. 
*/ @@ -2862,7 +2886,11 @@ inline C10_HOST_DEVICE T chebyshev_polynomial_t_forward(T x, int64_t n) { T q = x; T r; +<<<<<<< HEAD for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { +======= + for (int64_t k = 2; k <= n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = (x + x) * q - p; p = q; q = r; @@ -2910,7 +2938,11 @@ inline C10_HOST_DEVICE T chebyshev_polynomial_u_forward(T x, int64_t n) { T q = x + x; T r; +<<<<<<< HEAD for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { +======= + for (int64_t k = 2; k <= n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = (x + x) * q - p; p = q; q = r; @@ -2966,7 +2998,11 @@ inline C10_HOST_DEVICE T chebyshev_polynomial_v_forward(T x, int64_t n) { T q = x + x - T(1.0); T r; +<<<<<<< HEAD for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { +======= + for (int64_t k = 2; k <= n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = (x + x) * q - p; p = q; q = r; @@ -3026,7 +3062,11 @@ inline C10_HOST_DEVICE T chebyshev_polynomial_w_forward(T x, int64_t n) { T q = x + x + T(1.0); T r; +<<<<<<< HEAD for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { +======= + for (int64_t k = 2; k <= n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = (x + x) * q - p; p = q; q = r; @@ -3150,7 +3190,11 @@ inline C10_HOST_DEVICE T laguerre_polynomial_l_forward(T x, int64_t n) { T q = T(1.0) - x; T r; +<<<<<<< HEAD for (int64_t k = 1; (k < n) && !std::isnan(q); k++) { +======= + for (int64_t k = 1; k < n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = (((k + k) + (T(1.0) - x)) * q - k * p) / (k + 1); p = q; q = r; @@ -3190,7 +3234,11 @@ inline C10_HOST_DEVICE T legendre_polynomial_p_forward(T x, int64_t n) { T q = x; T r; +<<<<<<< HEAD for (int64_t k = 1; (k < n) && !std::isnan(q); k++) { +======= + for (int64_t k = 1; k < n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = ((k + k + 1) * x * q - k * p) / (k + 1); p = q; q = r; @@ -3733,7 +3781,11 @@ inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_t_forward(T x, int64_t n) T q = x + x - T(1.0); T r; +<<<<<<< HEAD for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { +======= + for (int64_t k = 2; k <= n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; p = q; q = r; @@ -3785,7 +3837,11 @@ inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_u_forward(T x, int64_t n) T q = x + x - T(1.0) + (x + x - T(1.0)); T r; +<<<<<<< HEAD for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { +======= + for (int64_t k = 2; k <= n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; p = q; q = r; @@ -3841,7 +3897,11 @@ inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_v_forward(T x, int64_t n) T q = x + x - T(1.0) + (x + x - T(1.0)) - T(1.0); T r; +<<<<<<< HEAD for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { +======= + for 
(int64_t k = 2; k <= n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; p = q; q = r; @@ -3897,7 +3957,11 @@ inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_w_forward(T x, int64_t n) T q = x + x - T(1.0) + (x + x - T(1.0)) + T(1.0); T r; +<<<<<<< HEAD for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { +======= + for (int64_t k = 2; k <= n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; p = q; q = r; diff --git a/aten/src/ATen/native/MaxUnpooling.cpp b/aten/src/ATen/native/MaxUnpooling.cpp index f91b892efec21..9681ca3b3454a 100644 --- a/aten/src/ATen/native/MaxUnpooling.cpp +++ b/aten/src/ATen/native/MaxUnpooling.cpp @@ -23,6 +23,11 @@ Tensor& max_unpooling2d_forward_out_cpu( // Nondeterministic with duplicate indices at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out"); +<<<<<<< HEAD +======= + auto oheight = output_size[0]; + auto owidth = output_size[1]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( indices_.scalar_type() == at::ScalarType::Long, "elements in indices should be type int64 but got: ", indices_.scalar_type()); @@ -43,9 +48,12 @@ Tensor& max_unpooling2d_forward_out_cpu( self_.sizes(), " with dimension ", i , " being empty."); } +<<<<<<< HEAD auto oheight = output_size[0]; auto owidth = output_size[1]; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto memory_format = self_.suggest_memory_format(); auto self = self_.contiguous(memory_format); auto indices = indices_.contiguous(memory_format); diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp index 08c42a0d470c7..d71507f8734ec 100644 --- a/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp +++ b/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp @@ -24,7 +24,11 @@ namespace at { namespace { +<<<<<<< HEAD inline void slow_conv_transpose2d_shape_check( +======= +static inline void slow_conv_transpose2d_shape_check( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, const Tensor& grad_output, const Tensor& weight, @@ -386,7 +390,11 @@ void slow_conv_transpose2d_out_cpu_template( } } +<<<<<<< HEAD void slow_conv_transpose2d_backward_out_cpu_template( +======= +static void slow_conv_transpose2d_backward_out_cpu_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input_, const Tensor& grad_output_, Tensor& grad_input, diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp index 469269ab07dfb..876e8700b7697 100644 --- a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp +++ b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp @@ -22,7 +22,11 @@ namespace at::native { namespace { +<<<<<<< HEAD inline void slow_conv_transpose3d_shape_check( +======= +static inline void slow_conv_transpose3d_shape_check( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) const Tensor& input, const Tensor& grad_output, const Tensor& weight, diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 0914ede84034c..a6d05cb58c9b5 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -61,9 +61,14 @@ #include #include #include +<<<<<<< HEAD #include static constexpr int MIOPEN_DIM_MAX = 5; +======= + +static const int MIOPEN_DIM_MAX = 5; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::meta { @@ -93,7 +98,11 @@ namespace { arg_name, " should contain ", expected, " elements not ", actual); } +<<<<<<< HEAD inline Tensor repeat_if_defined(const Tensor& t, const SymInt& repeat) { +======= + static inline Tensor repeat_if_defined(const Tensor& t, const SymInt& repeat) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (t.defined()) { return t.repeat_symint(repeat); } @@ -495,8 +504,11 @@ static std::tuple batch_norm_backward_cpu_template( return std::make_tuple(grad_input, grad_weight, grad_bias); } +<<<<<<< HEAD static bool PYTORCH_MIOPEN_EXTRA_LOGGING = c10::utils::check_env("PYTORCH_MIOPEN_EXTRA_LOGGING").value_or(false); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) BatchNormBackend _select_batch_norm_backend( const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& running_mean, const Tensor& running_var, bool training, double eps) { @@ -504,6 +516,7 @@ BatchNormBackend _select_batch_norm_backend( auto& ctx = at::globalContext(); bool cudnn_enabled = ctx.userEnabledCuDNN(); +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _select_batch_norm_backend" @@ -517,6 +530,8 @@ BatchNormBackend _select_batch_norm_backend( << " input.dim=" << input.dim() << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if ( input.is_cuda() && input.scalar_type() != at::kBFloat16 && weight.scalar_type() != at::kBFloat16 @@ -537,6 +552,7 @@ BatchNormBackend _select_batch_norm_backend( } // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM once ROCm officially supports NHWC in MIOpen +<<<<<<< HEAD // See https://github.com/pytorch/pytorch/issues/64427. 
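// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the pattern described around
// this conflict -- an opt-in environment flag whose default is derived from
// the detected MIOpen version -- written with only the standard library.
// `env_flag_or` is a hypothetical stand-in for the
// c10::utils::check_env(...).value_or(...) call used by both sides, and the
// 30500 threshold mirrors the HEAD side (MIOpen 3.5 / ROCm 7.0).
#include <cstdlib>
#include <string>

inline bool env_flag_or(const char* name, bool default_value) {
  const char* raw = std::getenv(name);   // re-read at call time so tests can
  if (raw == nullptr) {                  // flip the variable at runtime
    return default_value;
  }
  const std::string value(raw);
  return value == "1" || value == "ON" || value == "TRUE" || value == "YES";
}

inline bool suggest_nhwc_batchnorm(long miopen_version) {
  const bool default_on = miopen_version >= 30500;  // MIOpen 3.5 and newer
  return env_flag_or("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM", default_on);
}
// ---------------------------------------------------------------------------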
// non static variable is used to be able to change environment variable in runtime for testing // enabled by default for ROCm >= 7.0.0 with miopen 3.5 @@ -544,6 +560,12 @@ BatchNormBackend _select_batch_norm_backend( bool is_miopen_3_4 = miopen_version >= 30400; // ROCm 6.4 bool is_miopen_3_5 = miopen_version >= 30500; // ROCm 7.0 bool PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM").value_or(is_miopen_3_5); +======= + // See #64427 + // non static variable is used to be able to change environment variable in runtime for testing + // enabled by default for ROCm >= 7.0.0 + bool PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM").value_or(ROCM_VERSION >= 70000); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if ( detail::getCUDAHooks().compiledWithMIOpen() @@ -552,15 +574,30 @@ BatchNormBackend _select_batch_norm_backend( && input.dim() <= MIOPEN_DIM_MAX && input.dim() >= 3 && input.scalar_type() != at::kDouble +<<<<<<< HEAD && (is_miopen_3_4 || input.scalar_type() != at::kBFloat16) +======= +#if (defined(USE_ROCM) && ROCM_VERSION < 60400) + && (input.scalar_type() != at::kBFloat16) +#endif + && (detail::getCUDAHooks().versionMIOpen() >= 30400 || input.scalar_type() != at::kBFloat16) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) && weight.scalar_type() == at::kFloat // only FP32 weight for FP32 or FP16/BF16(mixed) input && weight.defined() && bias.defined() && ((running_mean.defined() && running_var.defined()) || (!running_mean.defined() && !running_var.defined() && training)) && (input.suggest_memory_format() == MemoryFormat::Contiguous +<<<<<<< HEAD || (is_miopen_3_5 && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM && (input.suggest_memory_format() == MemoryFormat::ChannelsLast || input.suggest_memory_format() == MemoryFormat::ChannelsLast3d))) +======= +#if (defined(USE_ROCM) && ROCM_VERSION >= 60500) + || (input.suggest_memory_format() == MemoryFormat::ChannelsLast && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM) + || (input.suggest_memory_format() == MemoryFormat::ChannelsLast3d && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM) +#endif + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) { return BatchNormBackend::Miopen; } @@ -578,6 +615,7 @@ std::tuple _batch_norm_impl_index( const Tensor& input, const std::optional& weight_opt /* optional */, const std::optional& bias_opt /* optional */, const std::optional& running_mean_opt /* optional */, const std::optional& running_var_opt /* optional */, bool training, double momentum, double eps, bool cudnn_enabled) { // See [Note: hacky wrapper removal for optional tensor] +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index" @@ -592,6 +630,8 @@ std::tuple _batch_norm_impl_index( << " cudnn_enabled=" << cudnn_enabled << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; const Tensor& bias = bias_opt.value_or(Tensor()); @@ -651,6 +691,7 @@ std::tuple _batch_norm_impl_index( Tensor reserve = at::empty({0}, 
input.options().dtype(kByte)); +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index (use_miopen)" @@ -674,6 +715,12 @@ std::tuple _batch_norm_impl_index( input.contiguous(input.suggest_memory_format()), weight.contiguous(), bias.contiguous(), +======= + if (backend == BatchNormBackend::Miopen) { + return std::tuple_cat( + at::miopen_batch_norm( + input.contiguous(input.suggest_memory_format()), weight.contiguous(), bias.contiguous(), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) running_mean.defined() ? running_mean.contiguous() : running_mean, running_var.defined() ? running_var.contiguous() : running_var, training, momentum, eps), @@ -681,9 +728,12 @@ std::tuple _batch_norm_impl_index( std::make_tuple(2)); } +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index (calling native_batch_norm)" << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::tuple_cat( at::native_batch_norm( input, weight, bias, running_mean, running_var, training, momentum, eps), @@ -696,8 +746,11 @@ std::tuple _batch_norm_impl_index_backward( const Tensor& input, const Tensor& grad_output, const std::optional& weight_opt /* optional */, const std::optional& running_mean_opt /* optional */, const std::optional& running_var_opt /* optional */, const std::optional& save_mean_opt /* optional */, const std::optional& save_var_transform_opt /* optional */, bool train, double epsilon, std::array output_mask, const Tensor &reservedSpace) { // See [Note: hacky wrapper removal for optional tensor] +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std :: cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index_backward" << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; const Tensor& running_mean = running_mean_opt.value_or(Tensor()); @@ -728,16 +781,22 @@ std::tuple _batch_norm_impl_index_backward( // backward in inference mode is not supported in cudnn, fallback to native if (impl_index == 0 || (!train)) { +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std :: cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index_backward (calling native_batch_norm_backward)" << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::native_batch_norm_backward(grad_output, input, weight, running_mean, running_var, save_mean, save_var_transform, train, epsilon, output_mask); } else if (impl_index == 1) { // TODO: _batch_norm_impl_index_backward is only used in JIT. 
cudnn NHWC // format conversion is done inside cudnn_batch_norm_backward instead return at::cudnn_batch_norm_backward(input, grad_output, weight, running_mean, running_var, save_mean, save_var_transform, epsilon, reservedSpace); } else if (impl_index == 2) { +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std :: cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index_backward (calling miopen_batch_norm_backward)" << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::miopen_batch_norm_backward(input, grad_output, weight, running_mean, running_var, save_mean, save_var_transform, epsilon); } TORCH_INTERNAL_ASSERT(false, "Unsupported impl_index in _batch_norm_impl_index_backward: ", impl_index); @@ -748,6 +807,7 @@ Tensor batch_norm( const Tensor& input, const std::optional& weight_opt, const std::optional& bias_opt, const std::optional& running_mean_opt, const std::optional& running_var_opt, bool training, double momentum, double eps, bool cudnn_enabled) { +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std :: cout @@ -763,11 +823,16 @@ Tensor batch_norm( << " cudnn_enabled=" << cudnn_enabled << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& weight = weight_opt.value_or(Tensor()); const Tensor& bias = bias_opt.value_or(Tensor()); const Tensor& running_mean = running_mean_opt.value_or(Tensor()); const Tensor& running_var = running_var_opt.value_or(Tensor()); +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::get<0>(at::_batch_norm_impl_index(input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled)); // TODO: switch to the new stack after the 2 week FC window diff --git a/aten/src/ATen/native/Onehot.cpp b/aten/src/ATen/native/Onehot.cpp index 2a20f95f10c20..4e1efae3926a7 100644 --- a/aten/src/ATen/native/Onehot.cpp +++ b/aten/src/ATen/native/Onehot.cpp @@ -1,6 +1,9 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include @@ -25,6 +28,7 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) { if (num_classes == -1) { num_classes = self.max().item().toLong() + 1; } +<<<<<<< HEAD { // If `self` is a DTensor, then allow implicit replication // of the `index` Tensor. @@ -44,6 +48,22 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) { } else { shape.emplace_back(num_classes); return at::empty_symint(shape, self.options()); +======= + at::Tensor index = at::arange(num_classes, self.options()); + return at::eq(self.unsqueeze(-1), index).to(kLong); + } + + auto shape = self.sizes().vec(); + + // empty tensor could be converted to one hot representation, + // but shape inference is not possible. 
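// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the one-hot encoding that both
// sides of this conflict build with tensor ops, written out for a flat vector
// of labels. The real operator appends num_classes to the input shape and
// scatters 1s along the new trailing dimension; the helper below is a
// plain-C++ stand-in with invented names and exceptions in place of
// TORCH_CHECK.
#include <algorithm>
#include <cstdint>
#include <stdexcept>
#include <vector>

inline std::vector<std::vector<int64_t>> one_hot_rows(
    const std::vector<int64_t>& labels, int64_t num_classes) {
  if (num_classes == -1) {
    // Infer the class count from the data, as the operator does.
    if (labels.empty()) {
      throw std::invalid_argument(
          "Can not infer total number of classes from empty tensor.");
    }
    num_classes = *std::max_element(labels.begin(), labels.end()) + 1;
  }
  std::vector<std::vector<int64_t>> out(
      labels.size(), std::vector<int64_t>(num_classes, 0));
  for (size_t i = 0; i < labels.size(); ++i) {
    // Mirrors ret.scatter_(-1, self.unsqueeze(-1), 1); .at() throws on
    // out-of-range labels instead of the operator's explicit range checks.
    out[i].at(labels[i]) = 1;
  }
  return out;
}
// ---------------------------------------------------------------------------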
+ if (self.numel() == 0) { + if (num_classes <= 0) { + TORCH_CHECK(false, "Can not infer total number of classes from empty tensor."); + } else { + shape.push_back(num_classes); + return at::empty(shape, self.options()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } @@ -66,8 +86,13 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) { } } +<<<<<<< HEAD shape.emplace_back(num_classes); Tensor ret = at::zeros_symint(shape, self.options()); +======= + shape.push_back(num_classes); + Tensor ret = at::zeros(shape, self.options()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ret.scatter_(-1, self.unsqueeze(-1), 1); return ret; } diff --git a/aten/src/ATen/native/PadNd.cpp b/aten/src/ATen/native/PadNd.cpp index 986447bab6141..bd23379a39598 100644 --- a/aten/src/ATen/native/PadNd.cpp +++ b/aten/src/ATen/native/PadNd.cpp @@ -70,10 +70,17 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value) new_shape.emplace_back(input_sizes[i]); } +<<<<<<< HEAD for (const auto i : c10::irange(l_pad)) { auto pad_idx = pad.size() - ((i + 1) * 2); auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1]; TORCH_CHECK(new_dim >= 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", +======= + for (const auto i : c10::irange((size_t)l_pad)) { + auto pad_idx = pad.size() - ((i + 1) * 2); + auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1]; + TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pad[pad_idx], " and ", pad[pad_idx + 1], " resulted in a negative output size, " "which is invalid. 
Check dimension ", l_diff + i, " of your input."); new_shape.emplace_back(new_dim); @@ -240,6 +247,7 @@ Tensor _pad_enum_symint(const Tensor &self, c10::SymIntArrayRef pad, int64_t mod default: {} } } +<<<<<<< HEAD std::ostringstream error_msg; error_msg << "Padding size " << pad.size() << " is not supported for " << input_dim << "D input tensor.\n"; @@ -249,6 +257,10 @@ Tensor _pad_enum_symint(const Tensor &self, c10::SymIntArrayRef pad, int64_t mod error_msg << " - 4D or 5D input: padding size = 6 (pads last 3 dimensions)"; C10_THROW_ERROR(NotImplementedError, error_msg.str()); +======= + C10_THROW_ERROR(NotImplementedError, + "Only 2D, 3D, 4D, 5D padding with non-constant padding are supported for now"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Tensor pad_symint(const Tensor &self, c10::SymIntArrayRef pad, std::string_view mode, std::optional value) { diff --git a/aten/src/ATen/native/PixelShuffle.h b/aten/src/ATen/native/PixelShuffle.h index 46ffa7ddb23c3..c61673770ac54 100644 --- a/aten/src/ATen/native/PixelShuffle.h +++ b/aten/src/ATen/native/PixelShuffle.h @@ -11,8 +11,11 @@ inline void check_pixel_shuffle_shapes(const Tensor& self, int64_t upscale_facto "pixel_shuffle expects a positive upscale_factor, but got ", upscale_factor); int64_t c = self.size(-3); +<<<<<<< HEAD TORCH_CHECK_VALUE(upscale_factor <= std::numeric_limits::max() / upscale_factor, "upscale factor is too large, (upscale_factor)^2 overflowed: upscale_factor=", upscale_factor); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t upscale_factor_squared = upscale_factor * upscale_factor; TORCH_CHECK(c % upscale_factor_squared == 0, "pixel_shuffle expects its input's 'channel' dimension to be divisible by the square of " diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index 7f335de04b90a..204b3afe97ad3 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -17,7 +17,11 @@ using max_pool2d_backward_fn = void(*)(const Tensor& grad_input, const Tensor& g DECLARE_DISPATCH(max_pool2d_fn, max_pool2d_kernel) DECLARE_DISPATCH(max_pool2d_backward_fn, max_pool2d_backward_kernel) +<<<<<<< HEAD // average pooling has same signature for forward and backward +======= +// averge pooling has same signature for forward and backward +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using avg_pool2d_fn = void(*)(const Tensor& output, const Tensor& input, int64_t kW, int64_t kH, int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, std::optional divisor_override); using avg_pool2d_backward_fn = void(*)(const Tensor& output, const Tensor& input, int kW, int kH, @@ -26,7 +30,11 @@ using avg_pool2d_backward_fn = void(*)(const Tensor& output, const Tensor& input DECLARE_DISPATCH(avg_pool2d_fn, avg_pool2d_kernel) DECLARE_DISPATCH(avg_pool2d_backward_fn, avg_pool2d_backward_kernel) +<<<<<<< HEAD // average pooling has same signature for forward and backward +======= +// averge pooling has same signature for forward and backward +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using avg_pool3d_fn = void(*)(const Tensor& output, const Tensor& input, int64_t kW, int64_t kH, int64_t kD, int64_t dW, int64_t dH, int64_t dD, int64_t padW, int64_t padH, 
int64_t padD, bool count_include_pad, diff --git a/aten/src/ATen/native/QuantizedLinear.cpp b/aten/src/ATen/native/QuantizedLinear.cpp index 3a3ab3794c3cd..fc41897456caf 100644 --- a/aten/src/ATen/native/QuantizedLinear.cpp +++ b/aten/src/ATen/native/QuantizedLinear.cpp @@ -25,11 +25,17 @@ #include #ifdef USE_FBGEMM +<<<<<<< HEAD C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi") #include #include #include C10_DIAGNOSTIC_POP() +======= +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // USE_FBGEMM namespace caffe2 { @@ -68,6 +74,10 @@ Tensor fbgemm_linear_int8_weight_fp32_activation( const float* input_ptr = input_contig.const_data_ptr(); TORCH_CHECK(input.dim() >= 2); +<<<<<<< HEAD +======= + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t M = size_to_dim_(input.dim() - 1, input.sizes()); const int64_t K = input.size(input.dim() - 1); TORCH_CHECK(weight.dim() == 2); @@ -410,8 +420,12 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) { Tensor fbgemm_linear_fp16_weight_fp32_activation( const Tensor& input, const Tensor& packed_weight, +<<<<<<< HEAD const std::optional& bias, at::Tensor& output) { +======= + const Tensor& bias) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated " "and will be removed in a future PyTorch release.") @@ -432,15 +446,25 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( TORCH_CHECK(input.size(input.dim() - 1) == packed_weight_fp16.numRows()) TORCH_CHECK(input.dim() >= 2); +<<<<<<< HEAD +======= + TORCH_CHECK(bias.dim() == 1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) const int64_t M = size_to_dim_(input.dim() - 1, input.sizes()); const int64_t N = packed_weight_fp16.numCols(); +<<<<<<< HEAD std::vector output_size = input.sizes().vec(); output_size.back() = N; // Resize output Tensor output.resize_(output_size); +======= + std::vector output_size = input.sizes().vec(); + output_size.back() = N; + Tensor output = at::empty(output_size, input.options().dtype(at::kFloat)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Call the fp16 gemm interface fbgemm::cblas_gemm_compute( @@ -452,16 +476,21 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( output.data_ptr()); // Add bias term +<<<<<<< HEAD c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias); const Tensor& bias_ = *bias_maybe_owned; if (bias_.defined()) { TORCH_CHECK(bias_.dim() == 1); output.add_(bias_); } +======= + output.add_(bias); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return output; } +<<<<<<< HEAD Tensor fbgemm_linear_fp16_weight_fp32_activation( const Tensor& input, const Tensor& packed_weight, @@ -470,6 +499,8 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( return at::native::fbgemm_linear_fp16_weight_fp32_activation(input, packed_weight, bias, output); } +======= +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor fbgemm_linear_fp16_weight( const Tensor& input, const Tensor& packed_weight, @@ -478,6 +509,7 @@ Tensor fbgemm_linear_fp16_weight( input, packed_weight, bias); } +<<<<<<< HEAD Tensor fbgemm_linear_fp16_weight( const Tensor& input, const Tensor& packed_weight, @@ -487,6 +519,8 @@ Tensor fbgemm_linear_fp16_weight( input, packed_weight, bias, output); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else // USE_FBGEMM Tensor fbgemm_linear_int8_weight_fp32_activation( @@ -576,8 +610,12 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) { Tensor fbgemm_linear_fp16_weight_fp32_activation( const Tensor& input, const Tensor& packed_weight, +<<<<<<< HEAD const std::optional& bias, at::Tensor& output) { +======= + const Tensor& bias) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated " "and will be removed in a future PyTorch release.") @@ -588,6 +626,7 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( false, "This PyTorch installation was not built with FBGEMM operators"); } +<<<<<<< HEAD Tensor fbgemm_linear_fp16_weight_fp32_activation( const Tensor& input, const Tensor& packed_weight, @@ -617,6 +656,8 @@ Tensor fbgemm_linear_fp16_weight( false, "This PyTorch installation was not built with FBGEMM operators"); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor fbgemm_linear_fp16_weight( const Tensor& input, const Tensor& packed_weight, diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index 75b30320b0276..8168097d942f9 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -108,6 +108,7 @@ bool use_mkldnn(const Tensor& input, TensorList params, TensorList hx) { return false; } +<<<<<<< HEAD bool use_cudnn(const Tensor& t) { bool acceptable = at::cudnn_is_acceptable(t); auto st = t.scalar_type(); @@ -115,6 +116,8 @@ bool use_cudnn(const Tensor& t) { return acceptable && (bfloat16_cond || st == kDouble || st == kFloat || st == kHalf); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template using pair_of = std::pair; @@ -538,7 +541,11 @@ c10::intrusive_ptr make_quantized_cell_params_fp16( std::move(w_ih_packed), std::move(w_hh_packed)); } +<<<<<<< HEAD std::unordered_map< +======= +static std::unordered_map< +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::string, c10::intrusive_ptr (*)(CellParamsSerializationType)> cell_params_deserializers = { @@ -578,7 +585,11 @@ struct QRNNCellParamsWrapper { // Gathers every two elements of a vector in a vector of pairs template +<<<<<<< HEAD std::vector> pair_vec(const std::vector& vals) { +======= +static std::vector> pair_vec(const std::vector& vals) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(vals.size() % 2 == 0, "Odd number of params or hiddens given to a bidirectional RNN"); std::vector> result; result.reserve(vals.size() / 2); @@ -590,7 +601,11 @@ std::vector> pair_vec(const 
std::vector& vals) { // Flattens a vector of pairs template +<<<<<<< HEAD std::vector unpair_vec(std::vector>&& vals) { +======= +static std::vector unpair_vec(std::vector>&& vals) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::vector result; result.reserve(vals.size() * 2); for (const auto i : c10::irange(vals.size())) { @@ -601,7 +616,11 @@ std::vector unpair_vec(std::vector>&& vals) { } // Parses a flat list of parameter tensors into a list of CellParams +<<<<<<< HEAD std::vector gather_params(TensorList params, bool has_biases, bool has_projections = false) { +======= +static std::vector gather_params(TensorList params, bool has_biases, bool has_projections = false) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static at::Tensor undefined; std::vector result; if (has_biases) { @@ -1207,7 +1226,11 @@ std::tuple _thnn_fused_lstm_cell_backwar bool train, \ bool bidirectional, \ bool batch_first) { \ +<<<<<<< HEAD if (use_cudnn(_input)) { \ +======= + if (at::cudnn_is_acceptable(_input)) { \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor output, hy; \ NAME##_cudnn_stub( \ _input.device().type(), \ @@ -1269,7 +1292,11 @@ std::tuple _thnn_fused_lstm_cell_backwar double dropout_p, \ bool train, \ bool bidirectional) { \ +<<<<<<< HEAD if (use_cudnn(data)) { \ +======= + if (at::cudnn_is_acceptable(data)) { \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor output, hy; \ NAME##_packed_cudnn_stub( \ data.device().type(), \ @@ -1437,7 +1464,11 @@ std::tuple lstm( TensorList _params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states"); +<<<<<<< HEAD if (use_cudnn(_input)) { +======= + if (at::cudnn_is_acceptable(_input)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor output, hy, cy; lstm_cudnn_stub(_input.device().type(), output, hy, cy, _input, hx, _params, has_biases, num_layers, dropout_p, train, bidirectional, batch_first); @@ -1498,7 +1529,11 @@ std::tuple lstm( TensorList _params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional) { TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states"); +<<<<<<< HEAD if (use_cudnn(data)) { +======= + if (at::cudnn_is_acceptable(data)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor output, hy, cy; lstm_packed_cudnn_stub(data.device().type(), output, hy, cy, data, batch_sizes, hx, _params, has_biases, num_layers, dropout_p, train, bidirectional); @@ -1894,10 +1929,17 @@ static DEFINE_QUANTIZED_RNN_CELL_DYNAMIC(quantized_rnn_tanh_cell_dynamic, simple namespace { +<<<<<<< HEAD [[maybe_unused]] auto ensure_linear_params_registered = register_linear_params(); auto cell_params_base_registry = +======= +[[maybe_unused]] static auto ensure_linear_params_registered = + register_linear_params(); + +static auto cell_params_base_registry = +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torch::selective_class_("rnn", 
TORCH_SELECTIVE_CLASS("CellParamsBase")) .def_pickle( [](const c10::intrusive_ptr& self) diff --git a/aten/src/ATen/native/RangeUtils.h b/aten/src/ATen/native/RangeUtils.h index fd62b8e01329b..3c05e4e51db43 100644 --- a/aten/src/ATen/native/RangeUtils.h +++ b/aten/src/ATen/native/RangeUtils.h @@ -47,7 +47,11 @@ int64_t compute_arange_size(const Scalar& start, const Scalar& end, const Scalar int64_t sgn = (xstep > 0) - (xstep < 0); size_d = std::ceil((xend - xstart + xstep - sgn) / xstep); } else { +<<<<<<< HEAD size_d = std::ceil((end.to() - start.to()) +======= + size_d = std::ceil(static_cast(end.to() - start.to()) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) / step.to()); } diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index db046428bb683..0a5a154d59cd5 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -71,8 +71,11 @@ #include #include #include +<<<<<<< HEAD #include #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -220,8 +223,11 @@ static void check_argmax_argmin( const char* name, const Tensor& self, const std::optional& dim) { +<<<<<<< HEAD TORCH_CHECK(!self.is_complex(), name, ": does not support complex input"); TORCH_CHECK(!(self.scalar_type() == kBool), name, ": does not support bool input"); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (dim.has_value()) { auto dim_ = maybe_wrap_dim(dim.value(), self.dim()); native::zero_numel_check_dims(self, dim_, name); @@ -402,6 +408,7 @@ TORCH_META_FUNC(amin) resize_reduction(*this, self, dim, keepdim, out_dtype); } +<<<<<<< HEAD TORCH_META_FUNC(hash_tensor) (const Tensor& self, IntArrayRef dim, bool keepdim, int64_t mode) { auto maybe_result = maybe_get_output(); @@ -415,6 +422,8 @@ TORCH_META_FUNC(hash_tensor) } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::meta namespace at::native { @@ -458,7 +467,10 @@ DEFINE_DISPATCH(argmin_stub); DEFINE_DISPATCH(cumsum_stub); DEFINE_DISPATCH(cumprod_stub); DEFINE_DISPATCH(logcumsumexp_stub); +<<<<<<< HEAD DEFINE_DISPATCH(xor_sum_stub); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor _logcumsumexp_cpu(const Tensor& self, int64_t dim) { Tensor result = at::empty_like(self, MemoryFormat::Contiguous); @@ -1469,7 +1481,11 @@ Tensor& nanmean_out( "nanmean(): expected input to have floating point or complex dtype but got ", self.scalar_type()); const auto factor = at::native::isnan(self).logical_not_().sum(dim, keepdim); +<<<<<<< HEAD at::nansum_out(result, self, dim, keepdim, opt_dtype).div_(factor); +======= + at::native::nansum_out(self, dim, keepdim, opt_dtype, result).div_(factor); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return result; } @@ -2251,6 +2267,7 @@ Tensor dist(const Tensor &self, const Tensor& other, const Scalar& p){ return at::norm(self - other, p); } +<<<<<<< HEAD enum class HashMode { XOR_SUM = 0 }; TORCH_IMPL_FUNC(hash_tensor_out) (const Tensor& self, IntArrayRef dim, bool keepdim, int64_t mode, const Tensor& result) { @@ 
-2269,6 +2286,8 @@ TORCH_IMPL_FUNC(hash_tensor_out) (const Tensor& self, IntArrayRef dim, bool keep } } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool cpu_equal(const Tensor& self, const Tensor& other) { if (!at::namedinference::are_names_equal( self.unsafeGetTensorImpl(), other.unsafeGetTensorImpl())) { diff --git a/aten/src/ATen/native/ReduceOps.h b/aten/src/ATen/native/ReduceOps.h index c562bf548403b..818a69b597693 100644 --- a/aten/src/ATen/native/ReduceOps.h +++ b/aten/src/ATen/native/ReduceOps.h @@ -27,7 +27,10 @@ DECLARE_DISPATCH(reduce_fn, min_values_stub) DECLARE_DISPATCH(reduce_fn, max_values_stub) DECLARE_DISPATCH(reduce_fn, argmax_stub) DECLARE_DISPATCH(reduce_fn, argmin_stub) +<<<<<<< HEAD DECLARE_DISPATCH(reduce_fn, xor_sum_stub) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using reduce_std_var_function = void (*)(TensorIterator&, double correction, bool take_sqrt); diff --git a/aten/src/ATen/native/ReplicationPadding.cpp b/aten/src/ATen/native/ReplicationPadding.cpp index 0c66c7a632997..18e57899bfd9d 100644 --- a/aten/src/ATen/native/ReplicationPadding.cpp +++ b/aten/src/ATen/native/ReplicationPadding.cpp @@ -229,20 +229,29 @@ void replication_pad3d_backward_out_cpu_template( int pbottom = paddingSize[3]; int pfront = paddingSize[4]; int pback = paddingSize[5]; +<<<<<<< HEAD int dimc = 0; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int dimw = 3; int dimh = 2; int dimd = 1; if (input.dim() == 5) { +<<<<<<< HEAD dimc++; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dimw++; dimh++; dimd++; } /* sizes */ +<<<<<<< HEAD int64_t ichannel = input.size(dimc); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t idepth = input.size(dimd); int64_t iheight = input.size(dimh); int64_t iwidth = input.size(dimw); @@ -252,9 +261,12 @@ void replication_pad3d_backward_out_cpu_template( at::native::padding::check_valid_input<3>(input, paddingSize); +<<<<<<< HEAD TORCH_CHECK(ichannel == gradOutput.size(dimc), "gradOutput width unexpected. Expected: ", ichannel, ", Got: ", gradOutput.size(dimc)); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(owidth == gradOutput.size(dimw), "gradOutput width unexpected. Expected: ", owidth, ", Got: ", gradOutput.size(dimw)); diff --git a/aten/src/ATen/native/Resize.cpp b/aten/src/ATen/native/Resize.cpp index a946def225b0c..7bb7b30193465 100644 --- a/aten/src/ATen/native/Resize.cpp +++ b/aten/src/ATen/native/Resize.cpp @@ -107,6 +107,14 @@ void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes) { storage->set_nbytes(size_bytes); } +<<<<<<< HEAD +======= +// Call the sparse implementation in SparseTensor.cpp directly. +// A dynamic dispatch here is NOT necessary, so I didn't put +// this function in native_functions.yaml +const Tensor& resize_as_sparse_(const Tensor& self, const Tensor& src); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // TODO(VitalyFedyunin): Move it to HTML docs. 
// // Strides of the output tensor of `resize_as_` operator is defined by input diff --git a/aten/src/ATen/native/Scalar.cpp b/aten/src/ATen/native/Scalar.cpp index 39e203f632781..0b4dd2d51e6d8 100644 --- a/aten/src/ATen/native/Scalar.cpp +++ b/aten/src/ATen/native/Scalar.cpp @@ -15,11 +15,15 @@ namespace at::native { Scalar item(const Tensor& self) { auto numel = self.sym_numel(); +<<<<<<< HEAD TORCH_SYM_CHECK( numel.sym_eq(1), "a Tensor with ", numel, " elements cannot be converted to Scalar"); +======= + TORCH_CHECK(numel == 1, "a Tensor with ", numel, " elements cannot be converted to Scalar"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (self.is_sparse()) { if (self._nnz() == 0) return Scalar(0); if (self.is_coalesced()) return at::_local_scalar_dense(self._values()); diff --git a/aten/src/ATen/native/SegmentReduce.cpp b/aten/src/ATen/native/SegmentReduce.cpp index 2b61bcec6a828..13e34c9c25f75 100644 --- a/aten/src/ATen/native/SegmentReduce.cpp +++ b/aten/src/ATen/native/SegmentReduce.cpp @@ -480,7 +480,11 @@ REGISTER_ZVECTOR_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets REGISTER_SVE256_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel) // Currently some computation is being duplicated across forward and backward. +<<<<<<< HEAD // TODO: Cache indices in forward pass to reuse in backward +======= +// TODO: Cache indices in forward pass to re-use in backward +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor _segment_reduce_backward_kernel( const Tensor& grad, const Tensor& output, diff --git a/aten/src/ATen/native/SharedReduceOps.h b/aten/src/ATen/native/SharedReduceOps.h index 15794040bf39c..b64e6d64604ed 100644 --- a/aten/src/ATen/native/SharedReduceOps.h +++ b/aten/src/ATen/native/SharedReduceOps.h @@ -346,17 +346,29 @@ template struct AbsSwitch {}; template +<<<<<<< HEAD inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch /*unused*/) { +======= +inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return static_cast(data); } template +<<<<<<< HEAD inline C10_DEVICE acc_t abs_if_complex(std::complex data, AbsSwitch /*unused*/) { +======= +inline C10_DEVICE acc_t abs_if_complex(std::complex data, AbsSwitch) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return static_cast(std::abs(data)); } template +<<<<<<< HEAD inline C10_DEVICE acc_t abs_if_complex(c10::complex data, AbsSwitch /*unused*/) { +======= +inline C10_DEVICE acc_t abs_if_complex(c10::complex data, AbsSwitch) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return static_cast(std::abs(at::opmath_type>(data))); } diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index 44215a26018f0..2f7b649c26795 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -59,8 +59,11 @@ TORCH_META_FUNC(topk) "selected index k out of range"); int64_t sliceSize = self.dim() == 0 ? 
1 : self.size(dim); TORCH_CHECK(k >= 0 && k <= sliceSize, "k not in range for dimension"); +<<<<<<< HEAD TORCH_CHECK(!self.is_complex(), " topk does not support complex dtypes on CPU"); TORCH_CHECK(!(self.scalar_type() == kBool), "topk does not support bool dtypes on CPU"); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Build the output size, which is the dim being selected set to // size k @@ -76,7 +79,15 @@ TORCH_META_FUNC2(sort, stable) (const Tensor& self, std::optional stable, int64_t dim, bool descending) { maybe_wrap_dim(dim, self.dim()); +<<<<<<< HEAD TORCH_CHECK(!self.is_complex(), " Sort does not support complex dtypes on CPU"); +======= + const auto self_dtype = self.dtype(); + TORCH_CHECK_VALUE( + self_dtype != ScalarType::ComplexFloat && + self_dtype != ScalarType::ComplexDouble, + "Sort currently does not support complex dtypes on CPU."); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // See issue: https://github.com/pytorch/pytorch/issues/65863 // Strides should be dense, so as not to allocate too much memory. diff --git a/aten/src/ATen/native/SpectralOpsUtils.h b/aten/src/ATen/native/SpectralOpsUtils.h index d04813e60281c..d17a67bd4e16c 100644 --- a/aten/src/ATen/native/SpectralOpsUtils.h +++ b/aten/src/ATen/native/SpectralOpsUtils.h @@ -21,7 +21,11 @@ enum class fft_norm_mode { // NOTE [ Fourier Transform Conjugate Symmetry ] // // Real-to-complex Fourier transform satisfies the conjugate symmetry. That is, +<<<<<<< HEAD // assuming X is the transformed K-dimensional signal, we have +======= +// assuming X is the transformed K-dimensionsal signal, we have +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // X[i_1, ..., i_K] = X[j_i, ..., j_K]*, // diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index 451869f521df2..9ebd64ad67646 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -145,6 +145,15 @@ #include #include +<<<<<<< HEAD +======= +namespace at::native { + +AdvancedIndex make_info(Tensor self, IOptTensorListRef orig); + +} // namespace at::native + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::meta { TORCH_META_FUNC(gather) @@ -469,7 +478,11 @@ static void build_index_op( TensorIteratorBase& iter, const at::native::AdvancedIndex& info, const Tensor& result) { +<<<<<<< HEAD // 'TensorIterator' needs to own the things coming from 'info', since +======= + // 'TensorIterator' needs to own the things comming from 'info', since +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // 'info' will be destroyed after the META function. TensorIteratorConfig config; // info.src is a restrided view of result @@ -1906,9 +1919,17 @@ Tensor& index_fill_( "This also applies to advanced indexing e.g. 
tensor[mask] = scalar"); } +<<<<<<< HEAD TORCH_CHECK( self.is_complex() || !source.isComplex(), "index_fill_(): Converting complex Scalar to non-complex type is not supported"); +======= + if (!self.is_complex() && source.isComplex()) { + TORCH_CHECK( + false, + "index_fill_(): Converting complex Scalar to non-complex type is not supported"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Handle the case when `self` is 0-dim Tensor self_nonzero_dim = (self.dim() == 0) ? self.unsqueeze(-1) : self; @@ -2145,6 +2166,7 @@ static void _scatter_via_index_put( const Tensor& src, const Tensor& mut_out, bool accumulate) { +<<<<<<< HEAD // If index is expanded with zero strides across non-scatter dimensions, // advanced indexing with the index tensor alone achieves the desired // semantics and avoids creating large intermediate tensors. @@ -2192,6 +2214,83 @@ static void _scatter_via_index_put( } } mut_out.index_put_(indices, src_view, accumulate); +======= + if (self.dim() == 1) { + torch::List> indices; + indices.reserve(1); + indices.push_back(index); + mut_out.index_put_(indices, src, accumulate); + } else { + Tensor mut_out_contig = mut_out.contiguous(); + + auto index_coords_sizes = index.sizes().vec(); + index_coords_sizes.push_back(self.dim()); + auto index_coords = at::empty( + index_coords_sizes, + at::TensorOptions().dtype(at::ScalarType::Long).device(self.device())); + + for (int64_t dim_other = 0; dim_other < self.dim(); dim_other++) { + if (dim_other == dim) { + continue; + } + auto dim_coord_vals = at::arange( + index.size(dim_other), at::TensorOptions().device(self.device())); + + for (int64_t dim_unsqueeze = 0; dim_unsqueeze < self.dim() - 1; + dim_unsqueeze++) { + dim_coord_vals = + dim_coord_vals.unsqueeze((dim_unsqueeze >= dim_other) ? -1 : 0); + } + + auto view_sizes = index.sizes().vec(); + view_sizes.push_back(1); + auto view_strides = index_coords.strides().vec(); + view_strides[self.dim()] = self.dim(); + + at::as_strided(index_coords, view_sizes, view_strides, dim_other) + .copy_(dim_coord_vals.unsqueeze(-1)); + } + + auto view_sizes = index.sizes().vec(); + view_sizes.push_back(1); + auto view_strides = index_coords.strides().vec(); + view_strides[self.dim()] = self.dim(); + + at::as_strided(index_coords, view_sizes, view_strides, dim) + .copy_(index.unsqueeze(-1)); + + Tensor index_coords_flat = index_coords.flatten(0, -2); + + // Copy mut_out_contig's strides into a tensor + // TODO: Is there a utility function that already does this? 
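  // The flat offset of a coordinate tuple (c_0, ..., c_{d-1}) into a
  // contiguous tensor is sum_k c_k * stride_k; e.g. a contiguous 2x3 tensor
  // has strides {3, 1}, so coordinate (1, 2) maps to 1*3 + 2*1 = 5. The
  // strides are materialized as a tensor so that this dot product can be
  // evaluated for all gathered coordinates at once further below.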
+ IntArrayRef mut_out_contig_strides = mut_out_contig.strides(); + Tensor coord_strides = at::empty( + {mut_out_contig.dim()}, + TensorOptions().dtype(at::ScalarType::Long).device(at::kCPU)); + std::memcpy( + coord_strides.mutable_data_ptr(), + mut_out_contig_strides.data(), + coord_strides.nbytes()); + coord_strides = coord_strides.to(mut_out_contig.device()); + + // `index_flat` contains the 1-D indices corresponding with the + // flattened `mut_out` + Tensor index_flat = (index_coords_flat * coord_strides).sum({-1}); + Tensor mut_out_flat = mut_out_contig.flatten(); + Tensor src_flat = + at::as_strided(src, index.sizes(), src.strides()).flatten(); + + torch::List> indices; + indices.reserve(1); + indices.push_back(index_flat); + + mut_out_flat.index_put_(indices, src_flat, accumulate); + + if (!mut_out.is_contiguous()) { + mut_out.copy_(mut_out_flat.reshape(mut_out.sizes())); + } + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template < @@ -2674,7 +2773,11 @@ inline std::tuple _take_along_dim_helper( std::move(dim)); } +<<<<<<< HEAD inline void checkDevice(CheckedFrom c, const Tensor& t, Device device) { +======= +static inline void checkDevice(CheckedFrom c, const Tensor& t, Device device) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( !t.defined() || t.device() == device, "Expected tensor to have ", @@ -2687,7 +2790,11 @@ inline void checkDevice(CheckedFrom c, const Tensor& t, Device device) { ")"); } +<<<<<<< HEAD inline void checkDevice( +======= +static inline void checkDevice( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CheckedFrom c, at::ArrayRef tensors, Device device) { diff --git a/aten/src/ATen/native/TensorAdvancedIndexingUtils.h b/aten/src/ATen/native/TensorAdvancedIndexingUtils.h index 6f127b711d3e8..f6d6ed40acb22 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexingUtils.h +++ b/aten/src/ATen/native/TensorAdvancedIndexingUtils.h @@ -35,9 +35,13 @@ inline std::tuple canDispatchToMaskedFill( auto self_device = self.device(); for (const std::optional& i : indices) { if (!i.has_value() || !(*i).defined()) { +<<<<<<< HEAD if (!mask.defined()) { num_ind++; } +======= + num_ind++; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { const Tensor& index = *i; if ((index.scalar_type() != kByte && index.scalar_type() != kBool) || @@ -73,11 +77,19 @@ inline AdvancedIndex make_info(Tensor self, IOptTensorListRef orig) { checkIndexTensorTypes(orig, /*allow_int*/ true); // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more // LongTensors +<<<<<<< HEAD auto indices = expandTensors(self, orig, /*ensure_same_device=*/true); // next broadcast all index tensors together try { indices = expand_outplace(indices); } catch (std::exception&) { +======= + auto indices = expandTensors(self, orig); + // next broadcast all index tensors together + try { + indices = expand_outplace(indices); + } catch (std::exception& e) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK_INDEX( false, "shape mismatch: indexing tensors could not be broadcast together" @@ -93,6 +105,15 @@ inline AdvancedIndex make_info(Tensor self, IOptTensorListRef orig) { if 
(!hasContiguousSubspace(indices)) { std::tie(self, indices) = transposeToFront(self, indices); } +<<<<<<< HEAD +======= + // Ensure indices are on the same device as self + for (auto& indice : indices) { + if (indice.defined() && indice.device() != self.device()) { + indice = indice.to(self.device()); + } + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (auto& indice : indices) { if (indice.defined() && indice.dtype() == at::kInt) { indice = indice.to(at::kLong); diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index c6126eda61e73..1df1d4b40924b 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -73,6 +73,10 @@ #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #endif @@ -846,7 +850,11 @@ TORCH_IMPL_FUNC(clamp_Tensor_out) (const Tensor& self, const OptionalTensorRef min, const OptionalTensorRef max, +<<<<<<< HEAD const Tensor& /*unused*/) { +======= + const Tensor&) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (min && max) { clamp_stub(device_type(), *this); } else if (min) { diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index 7df7745fc5077..ef8065a3ff90b 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -67,7 +67,11 @@ namespace at::native { namespace { // dense_to_sparse_{csr,bsr,csc,bsc} common helpers +<<<<<<< HEAD // Preparation for the N-D dense -> sparse compressed conversion. +======= +// Preparation fo the N-D dense -> sparse compressed conversion. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // The N-D input is converted to 3-D (single batch dim) where we check that the // product of batch dims is nonzero and for each batch the sparse matrix // contained within has the same number of non-zero elements. 
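A recurring difference between the two sides of this merge, visible in many of the hunks above and below, is whether translation-unit-local helpers are marked `static` or simply left inside an unnamed namespace. Inside an unnamed namespace the two spellings are equivalent, because everything there already has internal linkage; at plain namespace scope, however, `static` does change the linkage, so dropping it from an `inline` helper is not a pure style change. A minimal sketch of the distinction (illustrative names, not code from this patch):

// linkage_sketch.cpp
namespace {
// Already has internal linkage; an extra `static` here would be redundant.
void helper_in_unnamed_namespace() {}
}  // namespace

// Internal linkage: visible only in this translation unit.
static void helper_static() {}

// External linkage; `inline` permits identical definitions in other TUs.
inline void helper_inline() {}

int main() {
  helper_in_unnamed_namespace();
  helper_static();
  helper_inline();
  return 0;
}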
diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 1886e65fc1edc..20349ae43ea90 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -1367,9 +1367,15 @@ void randperm_cpu(Tensor& result, int64_t n, CPUGeneratorImpl* generator) { for (int64_t i = 0; i < n - 1; i++) { // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand) int64_t z = generator->random() % (n - i); +<<<<<<< HEAD scalar_t save = r__data[i * r__stride_0]; r__data[i * r__stride_0] = r__data[(z + i) * r__stride_0]; r__data[(z + i) * r__stride_0] = save; +======= + scalar_t sav = r__data[i * r__stride_0]; + r__data[i * r__stride_0] = r__data[(z + i) * r__stride_0]; + r__data[(z + i) * r__stride_0] = sav; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return; } @@ -1640,9 +1646,12 @@ Tensor zeros_symint( std::optional layout, std::optional device, std::optional pin_memory) { +<<<<<<< HEAD for (const auto& dim_size : size) { TORCH_CHECK(dim_size >= 0, "zeros: Dimension size must be non-negative."); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Layout layout_ = layout.value_or(Layout::Strided); if (at::sparse_csr::is_sparse_compressed(layout_)) { return zeros_sparse_compressed_symint( diff --git a/aten/src/ATen/native/TensorIteratorReduce.cpp b/aten/src/ATen/native/TensorIteratorReduce.cpp index ce2987eb251ae..a32782eec7763 100644 --- a/aten/src/ATen/native/TensorIteratorReduce.cpp +++ b/aten/src/ATen/native/TensorIteratorReduce.cpp @@ -80,7 +80,11 @@ static void two_pass_reduction(TensorIteratorBase& iter, loop2d_t loop) { } /// Chooses a dimension over which to parallelize. Prefers the outer-most +<<<<<<< HEAD /// dimension that's larger than the number of available threads. +======= +/// dimension thats larger than the number of available threads. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static int find_split_dim(TensorIteratorBase& iter) { int num_threads = at::get_num_threads(); auto shape = iter.shape(); diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp index c15b082f107b2..f25335234e53c 100644 --- a/aten/src/ATen/native/TensorProperties.cpp +++ b/aten/src/ATen/native/TensorProperties.cpp @@ -18,7 +18,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -58,12 +61,15 @@ c10::SymInt sym_size(const Tensor& self, int64_t dim) { return self.sym_size(dim); } +<<<<<<< HEAD c10::SymBool sym_is_contiguous( const Tensor& self, c10::MemoryFormat memory_format) { return self.sym_is_contiguous(memory_format); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::SymInt sym_stride(const Tensor& self, int64_t dim) { return self.sym_stride(dim); } @@ -91,6 +97,12 @@ bool cudnn_is_acceptable(const TensorBase& self) { return false; if (!self.is_cuda()) return false; +<<<<<<< HEAD +======= + auto st = self.scalar_type(); + if (!(st == kDouble || st == kFloat || st == kHalf)) + return false; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (!detail::getCUDAHooks().compiledWithCuDNN()) return false; // cuDNN functions like grid_sampler returns CUDNN_STATUS_BAD_PARAM on empty @@ -117,7 +129,11 @@ Tensor& detach_(Tensor& self) { } Tensor contiguous(const Tensor& self, MemoryFormat memory_format) { +<<<<<<< HEAD if (self.is_contiguous_or_false(memory_format)) { +======= + if (self.is_contiguous(memory_format)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return self; } TORCH_CHECK( diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 6df7761d822db..794a69451f6ac 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -1,5 +1,8 @@ +<<<<<<< HEAD #include #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include @@ -249,7 +252,11 @@ TORCH_PRECOMPUTE_META_FUNC(cat)(const ITensorListRef& tensors, int64_t dim) { // Checking names before the actual dimensions. auto maybe_outnames = namedinference::compute_cat_outnames(materialized); +<<<<<<< HEAD TORCH_CHECK_VALUE( +======= + TORCH_CHECK( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) !materialized.empty(), "torch.cat(): expected a non-empty list of Tensors"); @@ -276,7 +283,11 @@ TORCH_PRECOMPUTE_META_FUNC(cat)(const ITensorListRef& tensors, int64_t dim) { // when computing the actual output dtype and the flags. if (is_out_defined) { // Check for type promotion, if the output tensor is defined. 
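  // For example, concatenating float inputs into an integer `out` tensor is
  // rejected by the check below: canCast(kFloat, kLong) is false because the
  // implicit cast would be lossy.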
+<<<<<<< HEAD TORCH_CHECK_TYPE( +======= + TORCH_CHECK( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) canCast(out_dtype, result.scalar_type()), "torch.cat(): input types can't be cast to the desired output type ", result.scalar_type()); @@ -295,7 +306,11 @@ TORCH_PRECOMPUTE_META_FUNC(cat)(const ITensorListRef& tensors, int64_t dim) { // are compatible, i.e. we can execute `cat` on them. bool found_valid_tensor = valid < materialized.size(); if (found_valid_tensor) { +<<<<<<< HEAD TORCH_CHECK_INDEX( +======= + TORCH_CHECK( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dim <= materialized[valid].get().dim(), "torch.cat(): dimension ", dim, @@ -386,7 +401,11 @@ Tensor& set_storage_cpu_( result.unsafeGetTensorImpl()->set_storage_offset(storage_offset); at::OptionalIntArrayRef stride_opt = stride.data() != nullptr ? at::OptionalIntArrayRef(stride) : std::nullopt; +<<<<<<< HEAD // We can reuse this kernel for the meta device. +======= + // We can re-use this kernel for the meta device. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We just need to make sure we don't actually try to resize the (null) // storage. at::native::resize_impl_cpu_( @@ -461,7 +480,12 @@ Tensor& set_storage_meta__symint( size, stride, itemsize, std::move(storage_offset)); if (new_size_bytes.has_hint() && storage.sym_nbytes().has_hint() && +<<<<<<< HEAD (new_size_bytes > storage.sym_nbytes())) { +======= + TORCH_GUARD_SIZE_OBLIVIOUS( + new_size_bytes.sym_gt(storage.sym_nbytes()))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) storage.set_nbytes(std::move(new_size_bytes)); } } @@ -507,7 +531,11 @@ Tensor& set_cpu_(Tensor& result) { return result; } +<<<<<<< HEAD // We can't reuse the cpu kernel here because we don't want to use the cpu +======= +// We can't re-use the cpu kernel here because we don't want to use the cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // allocator. Tensor& set_meta_(Tensor& result) { caffe2::TypeMeta dtype = result.dtype(); @@ -1410,6 +1438,12 @@ Tensor as_strided_tensorimpl( IntArrayRef size, IntArrayRef stride, std::optional storage_offset_) { +<<<<<<< HEAD +======= + TORCH_INTERNAL_ASSERT( + !self.is_mps(), + "as_strided_tensorimpl does not work with MPS; call self.as_strided(...) 
instead"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto storage_offset = storage_offset_.value_or(self.storage_offset()); auto result = at::detail::make_tensor( c10::TensorImpl::VIEW, @@ -1880,18 +1914,33 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) { Tensor xtensor = self.expand(padded_size); +<<<<<<< HEAD Tensor urtensor; if (self.is_quantized()) { urtensor = at::empty_quantized(target_size, self); } else { urtensor = at::empty(target_size, self.options()); +======= + Tensor result; + if (self.is_quantized()) { + result = at::empty_quantized(target_size, self); + } else { + result = at::empty(target_size, self.options()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // return an empty tensor if one of the repeat dimensions is zero if (zero_tensor) { +<<<<<<< HEAD return urtensor; } +======= + return result; + } + + Tensor urtensor = at::alias(result); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(xtensor.dim())) { // can't unfold with step 0, so make sure step is at least 1 // (it doesn't matter what it is in that case, because the size is 0). @@ -1901,6 +1950,7 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) { urtensor.copy_(xtensor.expand_as(urtensor)); +<<<<<<< HEAD // Combine the dimensions to produce the target_size. // xtensor dims: [a0, ..., ad-1] // urtensor dims: [a0, ..., ad-1, b0, ..., bd-1] @@ -1921,6 +1971,13 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) { Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) { // If self.size() > len(reps), reps is promoted to self.size() by prepending +======= + return result; +} + +Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) { + // If self.size() > len(reps), reps is promoted to self.size() by pre-pending +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // 1’s to it to keep the same behaviour as `numpy.tile`. // Thus for a tensor of shape (2, 3, 4, 5), a dims of (2, 2) is treated // as (1, 1, 2, 2). @@ -2011,18 +2068,32 @@ Tensor reshape_symint(const Tensor& self, c10::SymIntArrayRef proposed_shape) { TORCH_CHECK(false, "reshape is not implemented for sparse tensors"); } +<<<<<<< HEAD if (self.is_contiguous_or_false() && !self.is_mkldnn()) { return self.view_symint(proposed_shape); } auto sym_numel = self.sym_numel(); +======= + auto sym_sizes = self.sym_sizes(); + auto sym_strides = self.sym_strides(); + auto sym_numel = self.sym_numel(); + if (definitely_contiguous(sym_sizes, sym_strides, sym_numel) && + !self.is_mkldnn()) { + return self.view_symint(proposed_shape); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::SymDimVector shape = infer_size_dv(proposed_shape, sym_numel); if (self.is_mkldnn()) { return at::_mkldnn_reshape(self, C10_AS_INTARRAYREF_SLOW(shape)); } +<<<<<<< HEAD auto sym_sizes = self.sym_sizes(); auto sym_strides = self.sym_strides(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // `computeStride` returns the proper strides to use if this // `reshape` can be just a view. 
@@ -2067,7 +2138,11 @@ Tensor _reshape_copy_symint( TORCH_CHECK(0, "_reshape_copy not implemented for mkldnn tensors"); } +<<<<<<< HEAD if (self.is_contiguous_or_false()) { +======= + if (self.is_contiguous()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return self.view_symint(shape).clone(at::MemoryFormat::Contiguous); } else { return at::_unsafe_view_symint( @@ -2444,7 +2519,11 @@ Tensor index_select_sparse_cpu( const auto dim_indices = indices[dim].contiguous(); // If nnz is smaller than size, then either indices[dim] or index gets +<<<<<<< HEAD // sorted, then this is followed by a binary search to find intersections. +======= + // sorted, then this is followed by a binary search to find interesections. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto get_selected_indices_small_nnz_large_size = [&]() -> std::tuple { const auto grain_size = at::internal::GRAIN_SIZE; @@ -3641,7 +3720,11 @@ Tensor& transpose_(Tensor& self, int64_t dim0, int64_t dim1) { namespace { // Transpose implementation for sparse compressed layouts // NB: We assume that dim1,dim0 have already been wrapped +<<<<<<< HEAD inline Tensor sparse_compressed_transpose( +======= +static inline Tensor sparse_compressed_transpose( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, int64_t dim0, int64_t dim1) { @@ -3950,7 +4033,11 @@ Tensor squeeze_qtensor(const Tensor& self, c10::OptionalIntArrayRef dims) { quantizer->scalar_type()); } // TODO: quantized Tensor support for SymInt needs to be added but basic +<<<<<<< HEAD // building blocks are missing for now. +======= + // building blocs are missing for now. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto result = make_qtensor( self, C10_AS_INTARRAYREF_SLOW(sizes), diff --git a/aten/src/ATen/native/TriangularOps.cpp b/aten/src/ATen/native/TriangularOps.cpp index 08b666e296ed7..fb5a4f1aca8a1 100644 --- a/aten/src/ATen/native/TriangularOps.cpp +++ b/aten/src/ATen/native/TriangularOps.cpp @@ -52,7 +52,10 @@ void apply_triu_tril_single( int64_t self_col_stride, bool upper) { constexpr int64_t zero = 0; +<<<<<<< HEAD k = std::clamp(k, -n, m); // Clamp k to [-n, m] to prevent i + k arithmetic overflow, especially if k approaches INT64_MAX/INT64_MIN. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (upper) { parallel_for(0, n, 0, [&](int64_t start, int64_t end) { diff --git a/aten/src/ATen/native/UnfoldBackward.h b/aten/src/ATen/native/UnfoldBackward.h index 156d2c8974b84..aeb63441d2422 100644 --- a/aten/src/ATen/native/UnfoldBackward.h +++ b/aten/src/ATen/native/UnfoldBackward.h @@ -29,7 +29,11 @@ namespace { // grad_in does not mean that it is a gradient wrt to input, // grad_in/grad_out is just an input/output of unfold_backward kernel. 
+<<<<<<< HEAD [[maybe_unused]] TensorIterator _make_unfold_backward_iter_over_grad_out( +======= +[[maybe_unused]] static TensorIterator _make_unfold_backward_iter_over_grad_out( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& grad_out, const Tensor& grad_in, int64_t dim, diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp index b14079e7ea19c..98fe558953c4d 100644 --- a/aten/src/ATen/native/Unique.cpp +++ b/aten/src/ATen/native/Unique.cpp @@ -124,7 +124,11 @@ struct IsUnique {}; template struct IsUnique { +<<<<<<< HEAD bool operator() (scalar_t* data_ptr, int64_t i) { +======= + inline bool operator() (scalar_t* data_ptr, int64_t i) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (i == 0) { return true; } return c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1]); } @@ -132,7 +136,11 @@ struct IsUnique { template struct IsUnique { +<<<<<<< HEAD bool operator() (scalar_t* data_ptr, int64_t i) { +======= + inline bool operator() (scalar_t* data_ptr, int64_t i) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (i == 0) { return true; } return (c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1])) && !(_isnan(data_ptr[i]) && _isnan(data_ptr[i - 1])); diff --git a/aten/src/ATen/native/UpSample.h b/aten/src/ATen/native/UpSample.h index cf6727c2207c7..901d8dd72fb05 100644 --- a/aten/src/ATen/native/UpSample.h +++ b/aten/src/ATen/native/UpSample.h @@ -4,6 +4,10 @@ #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -406,7 +410,11 @@ scalar_t cubic_convolution2(scalar_t x, scalar_t A) { } template +<<<<<<< HEAD static inline void get_cubic_upsample_coefficients( +======= +void get_cubic_upsample_coefficients( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t coeffs[4], scalar_t t) { scalar_t A = -0.75; diff --git a/aten/src/ATen/native/UpSampleBicubic2d.cpp b/aten/src/ATen/native/UpSampleBicubic2d.cpp index 3ab8795f6dca3..2df198a37e668 100644 --- a/aten/src/ATen/native/UpSampleBicubic2d.cpp +++ b/aten/src/ATen/native/UpSampleBicubic2d.cpp @@ -105,7 +105,11 @@ namespace at::native { namespace { template +<<<<<<< HEAD void upsample_bicubic2d_backward_out_frame( +======= +static void upsample_bicubic2d_backward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t* odata, scalar_t* idata, int64_t input_height, @@ -177,7 +181,11 @@ void upsample_bicubic2d_backward_out_frame( }); } +<<<<<<< HEAD void upsample_bicubic2d_backward_kernel( +======= +static void upsample_bicubic2d_backward_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& grad_input, const Tensor& grad_output_, IntArrayRef output_size, diff --git a/aten/src/ATen/native/VariableMethodStubs.cpp b/aten/src/ATen/native/VariableMethodStubs.cpp index 02c798a3d0400..9f2c8f7a5c8d9 100644 --- a/aten/src/ATen/native/VariableMethodStubs.cpp +++ b/aten/src/ATen/native/VariableMethodStubs.cpp @@ -25,11 +25,19 @@ namespace at::native { void 
_backward(const Tensor& self, TensorList inputs, const std::optional& gradient_opt, std::optional keep_graph, bool create_graph) { +<<<<<<< HEAD self._backward(inputs, gradient_opt, keep_graph, create_graph); } void set_data(Tensor& self, const Tensor& new_data) { self.set_data(new_data); +======= + return self._backward(inputs, gradient_opt, keep_graph, create_graph); +} + +void set_data(Tensor& self, const Tensor& new_data) { + return self.set_data(new_data); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Tensor data(const Tensor& self) { @@ -54,7 +62,11 @@ Tensor& requires_grad_(Tensor& self, bool _requires_grad) { } void retain_grad(Tensor& self) { +<<<<<<< HEAD self.retain_grad(); +======= + return self.retain_grad(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } bool retains_grad(const Tensor& self) { diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp index 0773217c90a4c..159ac68f45169 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp @@ -39,6 +39,10 @@ int register_linear_params() { } namespace { +<<<<<<< HEAD [[maybe_unused]] auto linear_params = register_linear_params(); +======= +[[maybe_unused]] static auto linear_params = register_linear_params(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace } // namespace ao::sparse diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h index 9a122cd7cf05e..6730187ebd385 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h @@ -4,11 +4,17 @@ #include #ifdef USE_FBGEMM +<<<<<<< HEAD C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi") #include #include #include C10_DIAGNOSTIC_POP() +======= +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace ao::sparse { diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp index 01b292adc01c3..11f2f9ed91551 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp @@ -17,7 +17,11 @@ namespace ao::sparse { +<<<<<<< HEAD +======= +int register_linear_params(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifdef USE_FBGEMM @@ -128,7 +132,11 @@ at::Tensor PackedLinearWeight::apply_impl( auto* input_tr_ptr = reinterpret_cast(input_tr.data_ptr()); // TODO: Activation transpose before and after the kernel can be removed if we +<<<<<<< HEAD // keep activation tensor always transposed. +======= + // keep activation tensor always tranposed. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fbgemm::transpose_simd( batch_size, K, input_ptr, K, input_tr_ptr, batch_size); diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp index 968e58d591c1d..91abc68c2484d 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp @@ -20,7 +20,11 @@ namespace ao::sparse { +<<<<<<< HEAD +======= +int register_linear_params(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifdef USE_FBGEMM namespace { diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp index b9cffe5b0bcbf..f7e2a0cdf3503 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp @@ -16,7 +16,11 @@ #endif namespace ao::sparse { +<<<<<<< HEAD +======= +int register_linear_params(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifdef USE_FBGEMM diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp index bc9b452bc6876..86db66a0b7935 100644 --- a/aten/src/ATen/native/cpu/Activation.cpp +++ b/aten/src/ATen/native/cpu/Activation.cpp @@ -26,11 +26,15 @@ namespace at::native { namespace { +<<<<<<< HEAD #if defined(__GNUC__) && __GNUC__ == 14 && defined(__aarch64__) && !defined(__ARM_FEATURE_SVE) // Workaround for gcc-14.2.0 ICE during RTL pass: expand when compiling for NEON __attribute__((optimize("no-tree-vectorize"))) #endif void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) { +======= +static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (at::isReducedFloatingType(input.scalar_type())) { AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&]() { using Vec = Vectorized; @@ -96,7 +100,11 @@ void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const Tensor } } +<<<<<<< HEAD void log_sigmoid_backward_cpu_kernel(TensorIterator& iter) { +======= +static void log_sigmoid_backward_cpu_kernel(TensorIterator& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (at::isReducedFloatingType(iter.dtype())) { AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "log_sigmoid_backward_cpu", [&]() { using Vec = Vectorized; @@ -150,7 +158,11 @@ void log_sigmoid_backward_cpu_kernel(TensorIterator& iter) { } } +<<<<<<< HEAD void threshold_kernel( +======= +static void threshold_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorIteratorBase& iter, const Scalar& threshold_scalar, const Scalar& value_scalar) { @@ -868,7 +880,11 @@ void hardswish_backward_kernel(TensorIterator& iter) { } } +<<<<<<< HEAD void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_) { +======= +static void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& 
negval_) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (at::isReducedFloatingType(iter.dtype())) { AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "leaky_relu_cpu", [&]() { auto zero_vec = Vectorized((float)(0)); @@ -907,7 +923,11 @@ void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_) { } } +<<<<<<< HEAD void leaky_relu_backward_kernel(TensorIteratorBase& iter, const Scalar& negval_) { +======= +static void leaky_relu_backward_kernel(TensorIteratorBase& iter, const Scalar& negval_) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (at::isReducedFloatingType(iter.dtype())) { AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "leaky_relu_backward_cpu", [&]() { auto zero_vec = Vectorized((float)(0)); diff --git a/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp b/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp index 3a19088114b2d..434d084d9cf2f 100644 --- a/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp @@ -520,7 +520,11 @@ cpu_adaptive_avg_pool3d_channels_last( scalar_t* out = output_data + i * channels; int64_t size = channels; +<<<<<<< HEAD // Note: For ordinary usage scenario, each out lane should +======= + // Note: For oridinary usage scenario, each out lane should +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // fit in L1 cache; otherwise consider block dim C. // Pass I: zero the out lane int64_t d1 = 0; diff --git a/aten/src/ATen/native/cpu/AtomicAddFloat.h b/aten/src/ATen/native/cpu/AtomicAddFloat.h index 526f86d705b77..db0bf3c40d151 100644 --- a/aten/src/ATen/native/cpu/AtomicAddFloat.h +++ b/aten/src/ATen/native/cpu/AtomicAddFloat.h @@ -22,7 +22,11 @@ static inline void cpu_atomic_add_float(float* dst, float fvalue) old_value.floatV = *dst; new_value.floatV = old_value.floatV + fvalue; +<<<<<<< HEAD unsigned* old_intV = &old_value.intV; +======= + unsigned* old_intV = (unsigned*)(&old_value.intV); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) while (!std::atomic_compare_exchange_strong(dst_intV, old_intV, new_value.intV)) { #ifdef __aarch64__ __asm__ __volatile__("yield;" : : : "memory"); diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index 10e0daacab33c..5725154954130 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -300,8 +300,12 @@ void div_floor_kernel(TensorIteratorBase& iter) { // In the special case of unsigned integer division, floor division is // equivalent to truncation division (since the signs of the divisor and // dividend are always the same) +<<<<<<< HEAD div_trunc_kernel(iter); return; +======= + return div_trunc_kernel(iter); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else if (isIntegralType(dtype, /*includeBool*/ false)) { // There's no SIMD integer division, so don't try to vectorize it. 
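    // For signed integers, floor division and truncation differ whenever the
    // operands have opposite signs and the remainder is nonzero: -7 / 2
    // truncates to -3 but floors to -4. The scalar form of the correction is
    //   q = a / b; r = a % b;
    //   if (r != 0 && ((r < 0) != (b < 0))) q -= 1;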
AT_DISPATCH_INTEGRAL_TYPES(dtype, "div_floor_cpu", [&]() { diff --git a/aten/src/ATen/native/cpu/BlasKernel.cpp b/aten/src/ATen/native/cpu/BlasKernel.cpp index 2e3a82ac049e7..c7a1e2b390f2a 100644 --- a/aten/src/ATen/native/cpu/BlasKernel.cpp +++ b/aten/src/ATen/native/cpu/BlasKernel.cpp @@ -118,7 +118,11 @@ gemm_notrans_( scale_(m, n, beta, c, ldc); // c += alpha * (a @ b) +<<<<<<< HEAD const uint64_t unsigned_m = m; +======= + const uint64_t unsigned_m = static_cast(m); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const uint64_t i_m = unsigned_m / 4; for (const uint64_t l : c10::irange(k)) { for (const uint64_t j : c10::irange(n)) { @@ -369,7 +373,11 @@ void gemm_notrans_( #endif // defined(__aarch64__) && !defined(C10_MOBILE) #if !defined(C10_MOBILE) +<<<<<<< HEAD float compute_dot(const at::Half* a, const at::Half* b, int64_t len) { +======= +static float compute_dot(const at::Half* a, const at::Half* b, int64_t len) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::native::CPU_CAPABILITY::fp16_dot_with_fp32_arith( a, b, len); } @@ -406,7 +414,11 @@ void gemm_transa_( }); } +<<<<<<< HEAD float compute_dot(const at::BFloat16* a, const at::BFloat16* b, int64_t len) { +======= +static float compute_dot(const at::BFloat16* a, const at::BFloat16* b, int64_t len) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::native::CPU_CAPABILITY::bf16_dot_with_fp32_arith(a, b, len); } diff --git a/aten/src/ATen/native/cpu/CopyKernel.cpp b/aten/src/ATen/native/cpu/CopyKernel.cpp index 365a79ba52ca9..06768175b39c7 100644 --- a/aten/src/ATen/native/cpu/CopyKernel.cpp +++ b/aten/src/ATen/native/cpu/CopyKernel.cpp @@ -15,12 +15,20 @@ namespace at::native { inline namespace CPU_CAPABILITY { namespace { +<<<<<<< HEAD bool reduced_input(ScalarType input_t, ScalarType output_t) { +======= +static bool reduced_input(ScalarType input_t, ScalarType output_t) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return !at::isFloat8Type(input_t) && at::isReducedFloatingType(input_t) && output_t == kFloat; } +<<<<<<< HEAD bool reduced_output(ScalarType input_t, ScalarType output_t) { +======= +static bool reduced_output(ScalarType input_t, ScalarType output_t) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return !at::isFloat8Type(output_t) && at::isReducedFloatingType(output_t) && input_t == kFloat; } diff --git a/aten/src/ATen/native/cpu/CrossKernel.cpp b/aten/src/ATen/native/cpu/CrossKernel.cpp index 66e49f911f68b..0700cd2669f48 100644 --- a/aten/src/ATen/native/cpu/CrossKernel.cpp +++ b/aten/src/ATen/native/cpu/CrossKernel.cpp @@ -15,7 +15,11 @@ namespace at::native { namespace { template +<<<<<<< HEAD void apply_cross(const Tensor& result, const Tensor& a, const Tensor& b, const int64_t dim) { +======= +static void apply_cross(const Tensor& result, const Tensor& a, const Tensor& b, const int64_t dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t total = a.numel() / 3; int64_t a_stride = a.stride(dim); int64_t b_stride = b.stride(dim); @@ -68,7 +72,11 @@ void apply_cross(const Tensor& result, const Tensor& a, 
const Tensor& b, const i }); } +<<<<<<< HEAD void cross_kernel_impl(const Tensor& result, const Tensor& a, const Tensor& b, const int64_t dim) { +======= +static void cross_kernel_impl(const Tensor& result, const Tensor& a, const Tensor& b, const int64_t dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, result.scalar_type(), "cross", [&]() { apply_cross(result, a, b, dim); }); diff --git a/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp b/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp index cce4c43b2e4e5..ffb0bd3ddcea7 100644 --- a/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp +++ b/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp @@ -259,6 +259,7 @@ inline void winograd_f2k3_input_transform_inplace__rvv( const vfloat32m1_t wd1 = __riscv_vfadd_vv_f32m1(d1, d2, 4); const vfloat32m1_t wd2 = __riscv_vfsub_vv_f32m1(d2, d1, 4); const vfloat32m1_t wd3 = __riscv_vfsub_vv_f32m1(d1, d3, 4); +<<<<<<< HEAD /* GCC 14.2 (RISC-V RVV) ICE workaround: * Avoid single-statement read-modify-write on MEM_REF like: * *input_tile_val = @@ -273,6 +274,13 @@ inline void winograd_f2k3_input_transform_inplace__rvv( tmp_input_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_input_tile_val, 2, wd2); tmp_input_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_input_tile_val, 3, wd3); *input_tile_val = tmp_input_tile_val; +======= + + *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 0, wd0); + *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 1, wd1); + *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 2, wd2); + *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 3, wd3); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline void winograd_f2k3_output_transform_inplace__rvv( @@ -286,6 +294,7 @@ inline void winograd_f2k3_output_transform_inplace__rvv( const vfloat32m1_t wm0 = __riscv_vfadd_vv_f32m1(m0_plus_m1, m2, 4); const vfloat32m1_t m1_sub_m2 = __riscv_vfsub_vv_f32m1(m1, m2, 4); const vfloat32m1_t wm1 = __riscv_vfsub_vv_f32m1(m1_sub_m2, m3, 4); +<<<<<<< HEAD /* GCC 14.2 (RISC-V RVV) ICE workaround — see note above. * Keep the temporary + write-back pattern to avoid ICE. * Do NOT rewrite into: @@ -295,6 +304,11 @@ inline void winograd_f2k3_output_transform_inplace__rvv( tmp_output_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_output_tile_val, 0, wm0); tmp_output_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_output_tile_val, 1, wm1); *input_tile_val = tmp_output_tile_val; +======= + + *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 0, wm0); + *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 1, wm1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline vfloat32m1_t @@ -315,6 +329,7 @@ inline void winograd_f2k3_kernel_transform__rvv( const vfloat32m1_t const_half = __riscv_vfmv_v_f_f32m1(0.5f, 4); const vfloat32m1_t g0_plus_g2 = __riscv_vfadd_vv_f32m1(g0, g2, 4); vfloat32m1_t half_g0_plus_g2 = __riscv_vfmul_vv_f32m1(const_half, g0_plus_g2, 4); +<<<<<<< HEAD /* GCC 14.2 (RISC-V RVV) ICE workaround — see note above. * Keep the temporary + write-back pattern to avoid ICE. 
* Do NOT rewrite into: @@ -326,6 +341,13 @@ inline void winograd_f2k3_kernel_transform__rvv( tmp_transform = __riscv_vset_v_f32m1_f32m1x4(tmp_transform, 2, vmulsubq_f32(half_g0_plus_g2, const_half, g1)); tmp_transform = __riscv_vset_v_f32m1_f32m1x4(tmp_transform, 3, g2); *transform = tmp_transform; +======= + + *transform = __riscv_vset_v_f32m1_f32m1x4(*transform, 0, g0); + *transform = __riscv_vset_v_f32m1_f32m1x4(*transform, 1, vmuladdq_f32(half_g0_plus_g2, const_half, g1)); + *transform = __riscv_vset_v_f32m1_f32m1x4(*transform, 2, vmulsubq_f32(half_g0_plus_g2, const_half, g1)); + *transform = __riscv_vset_v_f32m1_f32m1x4(*transform, 3, g2); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline vfloat32m1x4_t v4f_transpose4x4__rvv(const vfloat32m1x4_t m) { @@ -473,11 +495,19 @@ void convolution_depthwise3x3_winograd_impl( #else void convolution_depthwise3x3_winograd_impl( +<<<<<<< HEAD const Arguments& /*unused*/, const float* const /*unused*/, const float* const /*unused*/, const float* const /*unused*/, float* const /*unused*/) { +======= + const Arguments&, + const float* const, + const float* const, + const float* const, + float* const) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif /* __ARM_NEON__ */ diff --git a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp index e900bc5216117..6bd4bfee7d78c 100644 --- a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp @@ -34,7 +34,11 @@ struct Dist { // finish : This tells what to do with the aggregated value to compute // the norm. Generally this is the result of val ^ (1 / p). // backward : This is the gradient for that norm. Arguments are pretty +<<<<<<< HEAD // self explanatory. +======= + // self explanitory. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // There are a few cases where these aren't used. The 0 norm has no backward, // because it's always 0, so that's shortcircuited earlier. There's a special @@ -139,7 +143,11 @@ struct Dist { static inline data_t map(const data_t& diff, const data_t& p) { return diff; } static inline data_t red(const data_t& agg, const data_t& up) { return max(agg, up); } static inline scalar_t finish(const scalar_t agg, const scalar_t p) { return agg; } +<<<<<<< HEAD // TODO This backward pass uses a very complex expression to compute (diff +======= + // TODO This backward pass uses a very complext expression to compute (diff +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // == dist) that could be much faster if using SSE instructions. static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t dist, const Vec& p) { return Vec(grad) * sign(diff) * (Vec(1) - vec::minimum(Vec(1), (diff.abs() - Vec(dist)).abs().ceil())); } }; @@ -160,9 +168,16 @@ struct Dist { // value of k. 
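    // Concretely: with pairs (i, j), i < j, of n points enumerated row by row,
    // row i is preceded by S(i) = i * n - i * (i + 1) / 2 pairs. Solving
    // S(i) <= k for the largest integer i gives
    //   i = floor((n - 0.5) - sqrt((n - 0.5)^2 - 2 * k)),
    // and then j = k - S(i) + i + 1 = k - n * i + i * (i + 1) / 2 + i + 1,
    // which is what the loop below computes; the extra -1 under the square
    // root only guards against landing on the wrong side of an integer
    // boundary in floating point.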
parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [p, self_start, self_end, n, m, res_start](int64_t k, int64_t end) { const Vec pvec(p); +<<<<<<< HEAD double n2 = static_cast(n) - .5; // The -1 accounts for floating point truncation issues int64_t i = static_cast((n2 - std::sqrt(n2 * n2 - 2.0 * static_cast(k) - 1.0))); +======= + double n2 = n - .5; + // The -1 accounts for floating point truncation issues + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + int64_t i = static_cast((n2 - std::sqrt(n2 * n2 - 2 * k - 1))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t j = k - n * i + i * (i + 1) / 2 + i + 1; const scalar_t * self_i = self_start + i * m; @@ -421,19 +436,31 @@ void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, const double }); } +<<<<<<< HEAD void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) { +======= +static void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "pdist_backward", [&] { Dist::apply_backward_pdist(result, grad, self, p, dist); }); } +<<<<<<< HEAD void cdist_kernel_impl(Tensor& result, const Tensor& x1, const Tensor& x2, const double p) { +======= +static void cdist_kernel_impl(Tensor& result, const Tensor& x1, const Tensor& x2, const double p) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_FLOATING_TYPES(result.scalar_type(), "cdist", [&] { Dist::apply_cdist(result, x1, x2, p); }); } +<<<<<<< HEAD void cdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& x1, const Tensor& x2, const double p, const Tensor& dist) { +======= +static void cdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& x1, const Tensor& x2, const double p, const Tensor& dist) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_FLOATING_TYPES(result.scalar_type(), "cdist_backward", [&] { Dist::apply_backward_cdist(result, grad, x1, x2, p, dist); }); diff --git a/aten/src/ATen/native/cpu/DistributionKernels.cpp b/aten/src/ATen/native/cpu/DistributionKernels.cpp index e3fdefb523044..ead8515216baa 100644 --- a/aten/src/ATen/native/cpu/DistributionKernels.cpp +++ b/aten/src/ATen/native/cpu/DistributionKernels.cpp @@ -27,7 +27,11 @@ namespace at::native { namespace { +<<<<<<< HEAD void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { +======= +static void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::cauchy_kernel(iter, median, sigma, generator); } @@ -101,7 +105,11 @@ void bernoulli_scalar_kernel(const TensorBase &self, double p, std::optional gen) { +======= +static void exponential_kernel_default(TensorIteratorBase& iter, double lambda, std::optional gen) { +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::exponential_kernel(iter, lambda, generator); } @@ -198,12 +206,20 @@ void exponential_kernel(TensorIteratorBase &iter, double lambda, std::optional gen) { +======= +static void geometric_kernel(TensorIteratorBase& iter, double p, std::optional gen) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::geometric_kernel(iter, p, generator); } +<<<<<<< HEAD void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, std::optional gen) { +======= +static void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, std::optional gen) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::log_normal_kernel(iter, mean, std, generator); } @@ -218,12 +234,20 @@ void normal_kernel(const TensorBase &self, double mean, double std, std::optiona templates::cpu::normal_kernel(self, mean, std, generator); } +<<<<<<< HEAD void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen) { +======= +static void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_from_to_kernel(iter, range, base, generator); } +<<<<<<< HEAD void random_kernel(TensorIteratorBase& iter, std::optional gen) { +======= +static void random_kernel(TensorIteratorBase& iter, std::optional gen) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_kernel(iter, generator); } @@ -231,7 +255,11 @@ void random_kernel(TensorIteratorBase& iter, std::optional gen) { // This is the special kernel to handle single specific case: // from(inclusive) = std::numeric_limits::lowest() // to(exclusive) = None (= std::numeric_limits::max() + 1) +<<<<<<< HEAD void random_full_64_bits_range_kernel(TensorIteratorBase& iter, std::optional gen) { +======= +static void random_full_64_bits_range_kernel(TensorIteratorBase& iter, std::optional gen) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_full_64_bits_range_kernel(iter, generator); } diff --git a/aten/src/ATen/native/cpu/DistributionTemplates.h b/aten/src/ATen/native/cpu/DistributionTemplates.h index 1f8693902a32b..63a5f751caaad 100644 --- a/aten/src/ATen/native/cpu/DistributionTemplates.h +++ b/aten/src/ATen/native/cpu/DistributionTemplates.h @@ -85,7 +85,11 @@ struct RandomKernel { // ==================================================== Normal ======================================================== 
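A pattern that repeats throughout this diff, including the DistributionKernels.cpp hunks just above, is one side marking kernels in an anonymous namespace as `static` while HEAD drops the keyword. Both spellings give the functions internal linkage, so the keyword is redundant inside an unnamed namespace; a minimal sketch:

    // Illustrative only: names declared inside an unnamed namespace already have
    // internal linkage, so adding `static` changes nothing about their visibility.
    namespace {

    void helper_a() {}         // internal linkage via the unnamed namespace
    static void helper_b() {}  // same linkage; `static` is redundant here

    }  // namespace

    int main() {
      helper_a();
      helper_b();
      return 0;
    }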
#ifdef CPU_CAPABILITY_AVX2 +<<<<<<< HEAD void normal_fill_16_AVX2(float *data, +======= +static void normal_fill_16_AVX2(float *data, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const __m256* two_pi, const __m256* one, const __m256* minus_two, @@ -136,7 +140,11 @@ void normal_fill_AVX2(const TensorBase &self, const float mean, const float std, #endif template +<<<<<<< HEAD void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t std) { +======= +static void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t std) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto j : c10::irange(8)) { const scalar_t u1 = 1 - data[j]; // [0, 1) -> (0, 1] for log. const scalar_t u2 = data[j + 8]; diff --git a/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp b/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp index 5ac4971396076..4b4a2c8c256fc 100644 --- a/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp +++ b/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp @@ -96,6 +96,7 @@ inline void _exp_reduce_sum_fusion_kernel( for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) { auto tmp0 = vec::Vectorized::loadu(a + i); auto tmp1 = tmp0 - vec_max; +<<<<<<< HEAD Vectorized tmp2; if constexpr (std::is_same_v && (std::is_same_v || std::is_same_v)) @@ -104,6 +105,9 @@ inline void _exp_reduce_sum_fusion_kernel( } else { tmp2 = tmp1.exp_u20(); } +======= + auto tmp2 = tmp1.exp_u20(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) vec_tmp_sum += tmp2; _store(out + i, tmp2); } @@ -158,14 +162,22 @@ inline void _mul_reduce_max_fusion_kernel( } template +<<<<<<< HEAD inline scalar_t* conditional_data_ptr(scalar_t* ptr, scalar_t* ptr2) { +======= +static inline scalar_t* conditional_data_ptr(scalar_t* ptr, scalar_t* ptr2) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(ptr2 == nullptr); return ptr; } template , int> = 0> +<<<<<<< HEAD inline scalar_t* conditional_data_ptr(float* ptr, scalar_t* ptr2) { +======= +static inline scalar_t* conditional_data_ptr(float* ptr, scalar_t* ptr2) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ptr2; } @@ -315,12 +327,21 @@ void cpu_flash_attention( bool is_causal, std::optional attn_mask, std::optional scale) { +<<<<<<< HEAD // Query (Batch x Num_heads x Q_seq_len x Dim_per_head) // -> (Batch x Q_seq_len x Num_heads x Dim_per_head) // Key (Batch x KV_num_heads x KV_seq_len x Dim_per_head) // -> (Batch x KV_seq_len x KV_num_heads x Dim_per_head) // Value (Batch x KV_num_heads x KV_seq_len x Dim_per_head) // -> (Batch x KV_seq_len x KV_num_heads x Dim_per_head) +======= + // Query (Batch x Num_heads x Q_seq_len x Dim_per_head) + // -> (Batch x Q_seq_len x Num_heads x Dim_per_head) + // Key (Batch x Num_heads x KV_seq_len x Dim_per_head) + // -> (Batch x KV_seq_len x Num_heads x Dim_per_head) + // Value (Batch x Num_heads x KV_seq_len x Dim_per_head) + // -> (Batch x KV_seq_len x Num_heads x Dim_per_head) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::Tensor query = q.transpose(1, 2); at::Tensor key = k.transpose(1, 2); at::Tensor 
value = v.transpose(1, 2); @@ -338,8 +359,11 @@ void cpu_flash_attention( int64_t qSize = query.size(1); int64_t kvSize = value.size(1); int64_t num_head = query.size(2); +<<<<<<< HEAD int64_t kv_num_head = key.size(2); int64_t repeat_factor = num_head / kv_num_head; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t headSize = query.size(3); bool has_attn_mask = attn_mask.has_value() && attn_mask.value().numel(); @@ -400,7 +424,11 @@ void cpu_flash_attention( // When the number of gemm is greater than the number of pack, // the pack overhead can be overlapped. if (need_pack) { +<<<<<<< HEAD double pack_size = batchSize * kv_num_head * kvSize * headSize; +======= + double pack_size = batchSize * num_head * kvSize * headSize; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) double qs_per_thread = (batchSize * num_head * qSlice + num_thread - 1) / num_thread; double gemm_size_per_thread = qs_per_thread * qSplitSize * (is_causal ? std::min(qSize, kvSize) : kvSize) * headSize; @@ -450,10 +478,17 @@ void cpu_flash_attention( at::Tensor qeury_t_padding; if (need_pack) { key_t_reorder = at::empty( +<<<<<<< HEAD {batchSize, kv_num_head, eheadSize, kvSize}, c10::CppTypeToScalarType::value); value_t_reorder = at::empty( {batchSize, kv_num_head, kv_padding_size, headSize}, +======= + {batchSize, num_head, eheadSize, kvSize}, + c10::CppTypeToScalarType::value); + value_t_reorder = at::empty( + {batchSize, num_head, kv_padding_size, headSize}, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::CppTypeToScalarType::value); key_reorder_ptr = key_t_reorder.data_ptr(); value_reorder_ptr = value_t_reorder.data_ptr(); @@ -472,11 +507,19 @@ void cpu_flash_attention( {num_thread, kvSplitSize, headSize}, c10::CppTypeToScalarType::value); scalar_t* transpose_buffer_ptr = tranpose_t_reorder.data_ptr(); +<<<<<<< HEAD at::parallel_for(0, batchSize * kv_num_head * kvSlice, 1, [&](int64_t begin, int64_t end) { int ompIdx = at::get_thread_num(); int64_t i = 0, kv_j = 0, l = 0, n = 0; scalar_t* transpose_ptr = transpose_buffer_ptr + ompIdx * kvSplitSize * headSize; at::native::data_index_init(begin, i, batchSize, kv_j, kv_num_head, l, kvSlice); +======= + at::parallel_for(0, batchSize * num_head * kvSlice, 1, [&](int64_t begin, int64_t end) { + int ompIdx = at::get_thread_num(); + int64_t i = 0, j = 0, l = 0, n = 0; + scalar_t* transpose_ptr = transpose_buffer_ptr + ompIdx * kvSplitSize * headSize; + at::native::data_index_init(begin, i, batchSize, j, num_head, l, kvSlice); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for ([[maybe_unused]] auto z : c10::irange(begin, end)) { n = l * kvSplitSize; int64_t kvBlockSize = std::min(kvSplitSize, kvSize - n); @@ -486,7 +529,11 @@ void cpu_flash_attention( kvBlockSize, headSize, /* src_ptr */ +<<<<<<< HEAD reinterpret_cast(k_data + i * kStrideB + kv_j * kStrideH + n * kStrideN), +======= + reinterpret_cast(k_data + i * kStrideB + j * kStrideH + n * kStrideN), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /* ld_src */ kStrideN, /* dst */ reinterpret_cast(transpose_ptr), /* ld_dst */ kvBlockSize); @@ -494,24 +541,40 @@ void cpu_flash_attention( // Pack [headSize, 
kvBlockSize] at::vec::pack_vnni2( /* src */ reinterpret_cast(transpose_ptr), +<<<<<<< HEAD /* dst */ reinterpret_cast(key_reorder_ptr + i * kv_num_head * eheadSize * kvSize + kv_j * eheadSize * kvSize + n * eheadSize), +======= + /* dst */ reinterpret_cast(key_reorder_ptr + i * num_head * eheadSize * kvSize + + j * eheadSize * kvSize + n * eheadSize), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /* ld_src */ kvBlockSize, /* K */ headSize, /* N */ kvBlockSize); // Pack [kvBlockSize, headSize] at::vec::pack_vnni2( +<<<<<<< HEAD /* src */ reinterpret_cast(v_data + i * vStrideB + kv_j * vStrideH + n * vStrideN), /* dst */ reinterpret_cast(value_reorder_ptr + i * kv_num_head * kv_padding_size * headSize + kv_j * kv_padding_size * headSize + n * headSize), +======= + /* src */ reinterpret_cast(v_data + i * vStrideB + j * vStrideH + n * vStrideN), + /* dst */ reinterpret_cast(value_reorder_ptr + + i * num_head * kv_padding_size * headSize + + j * kv_padding_size * headSize + n * headSize), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /* ld_src */ vStrideN, /* K */ kvBlockSize, /* N */ headSize); // Move to the next query +<<<<<<< HEAD at::native::data_index_step(i, batchSize, kv_j, kv_num_head, l, kvSlice); +======= + at::native::data_index_step(i, batchSize, j, num_head, l, kvSlice); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } }); } @@ -533,7 +596,10 @@ void cpu_flash_attention( for ([[maybe_unused]] auto z : c10::irange(begin, end)) { int64_t m = k * qSplitSize; int64_t qBlockSize = std::min(qSplitSize, qSize - m); +<<<<<<< HEAD int64_t kv_j = j / repeat_factor; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Initialize max and sum fill_stub(qk_max_data, -std::numeric_limits::infinity(), qBlockSize); @@ -570,8 +636,13 @@ void cpu_flash_attention( !headSize_even ? 
query_t_padding_ptr : q_data + i * qStrideB + j * qStrideH + m * qStrideM, +<<<<<<< HEAD key_reorder_ptr + i * kv_num_head * eheadSize * kvSize + kv_j * eheadSize * kvSize + n * eheadSize, +======= + key_reorder_ptr + i * num_head * eheadSize * kvSize + + j * eheadSize * kvSize + n * eheadSize, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) qk_data); } } else { @@ -582,7 +653,11 @@ void cpu_flash_attention( qBlockSize, headSize, static_cast(1), +<<<<<<< HEAD k_data + i * kStrideB + kv_j * kStrideH + +======= + k_data + i * kStrideB + j * kStrideH + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) n * kStrideN, kStrideN, q_data + i * qStrideB + j * qStrideH + @@ -701,8 +776,13 @@ void cpu_flash_attention( n > 0, qk_reduced_data, value_reorder_ptr + +<<<<<<< HEAD i * kv_num_head * kv_padding_size * headSize + kv_j * kv_padding_size * headSize + psize * headSize, +======= + i * num_head * kv_padding_size * headSize + + j * kv_padding_size * headSize + psize * headSize, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dst_data); } } else { @@ -713,7 +793,11 @@ void cpu_flash_attention( qBlockSize, kvBlockSize, static_cast(1), +<<<<<<< HEAD v_data + i * vStrideB + kv_j * vStrideH + +======= + v_data + i * vStrideB + j * vStrideH + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) n * vStrideN, vStrideN, conditional_data_ptr(qk_data, qk_reduced_data), @@ -778,15 +862,24 @@ void cpu_flash_attention_backward( // Sizes TORCH_CHECK((query.size(3) == value.size(3)) && (key.size(3) == value.size(3)), "scaled_dot_product_attention_flash_attention_backward: Q/K/V should have the same head size"); +<<<<<<< HEAD // Query (Batch x Q_seq_len x Num_heads x Dim_per_head) // Key (Batch x KV_seq_len x KV_num_heads x Dim_per_head) // Value (Batch x KV_seq_len x KV_num_heads x Dim_per_head) +======= + // Query (Batch x Q_seq_len x Num_heads x Dim_per_head) + // Key (Batch x KV_seq_len x Num_heads x Dim_per_head) + // Value (Batch x KV_seq_len x Num_heads x Dim_per_head) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t batchSize = query.size(0); int64_t qSize = query.size(1); int64_t kvSize = value.size(1); int64_t num_head = query.size(2); +<<<<<<< HEAD int64_t kv_num_head = key.size(2); int64_t repeat_factor = num_head / kv_num_head; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t headSize = query.size(3); bool has_attn_mask = attn_mask.has_value() && attn_mask.value().numel(); @@ -877,9 +970,15 @@ void cpu_flash_attention_backward( accum_t* buf_data = buf.data_ptr(); scalar_t* buf_reduced_data = is_reduced_type ? 
buf_reduced.data_ptr() : nullptr; +<<<<<<< HEAD at::parallel_for(0, batchSize * kv_num_head, 1, [&](int64_t begin, int64_t end) { int64_t i = 0, kv_j = 0; data_index_init(begin, i, batchSize, kv_j, kv_num_head); +======= + at::parallel_for(0, batchSize * num_head, 1, [&](int64_t begin, int64_t end) { + int64_t i = 0, j = 0; + data_index_init(begin, i, batchSize, j, num_head); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int ompIdx = at::get_thread_num(); accum_t* buf_ptr = buf_data + ompIdx * size_per_thread; accum_t* attn_data = buf_ptr; @@ -891,6 +990,7 @@ void cpu_flash_attention_backward( at::Tensor dsum = at::empty({qSplitSize}, query.options().dtype(accumulate_dtype)); accum_t* dsum_data = dsum.data_ptr(); for ([[maybe_unused]] auto z : c10::irange(begin, end)) { +<<<<<<< HEAD for (int64_t r = 0; r < repeat_factor; r++) { int64_t j = kv_j * repeat_factor + r; // rowsum of grad_out * out @@ -1084,6 +1184,198 @@ void cpu_flash_attention_backward( } // Move to the next query data_index_step(i, batchSize, kv_j, kv_num_head); +======= + // rowsum of grad_out * out + for (int64_t m = 0; m < qSize; m += qSplitSize) { + int64_t qBlockSize = std::min(qSplitSize, qSize - m); + // dsum <- rowsum(grad_out * out) + for (const auto row : c10::irange(qBlockSize)) { + *(dsum_data + row) = vec::map2_reduce_all( + [](Vec x, Vec y) { return x * y; }, + [](Vec x, Vec y) { return x + y; }, + grad_out_data + i * grad_oStrideB + j * grad_oStrideH + (m + row) * grad_oStrideM, + out_data + i * oStrideB + j * oStrideH + (m + row) * oStrideM, + headSize); + } + int64_t num_keys = is_causal ? std::min(m + qBlockSize, kvSize) : kvSize; + for (int64_t n = 0; n < num_keys; n += kvSplitSize) { + int64_t kvBlockSize = std::min(kvSplitSize, kvSize - n); + // attn <- scale * q @ k.T + cpublas::gemm( + TransposeType::Transpose, + TransposeType::NoTranspose, + kvBlockSize, + qBlockSize, + headSize, + scaling_factor, + k_data + i * kStrideB + j * kStrideH + + n * kStrideN, + kStrideN, + q_data + i * qStrideB + j * qStrideH + + m * qStrideM, + qStrideM, + static_cast(0), + attn_data, + kvBlockSize); + // attn <- attn + mask + if (has_attn_mask) { + accum_t one = accum_t(1); + for (const auto row : c10::irange(qBlockSize)) { +#if __GNUC__ == 11 && defined(__ARM_FEATURE_SVE) + _scale_attn_mask_fusion_kernel( + attn_data + row * kvBlockSize, + mask_data + i * mStrideB + j * mStrideH + + (m + row) * mStrideM + (mStrideN == 0 ? 
0 : n), + kvBlockSize, + attn_data + row * kvBlockSize, + one, + mStrideN == 0); +#else + if (mStrideN == 0) { + _scale_attn_mask_fusion_kernel( + attn_data + row * kvBlockSize, + mask_data + i * mStrideB + j * mStrideH + + (m + row) * mStrideM, + kvBlockSize, + attn_data + row * kvBlockSize, + one); + } else { + _scale_attn_mask_fusion_kernel( + attn_data + row * kvBlockSize, + mask_data + i * mStrideB + j * mStrideH + + (m + row) * mStrideM + n, + kvBlockSize, + attn_data + row * kvBlockSize, + one); + } +#endif + } + } + // restore self attention after softmax from logsumexp + // attn <- exp(attn - normalizer) + for (const auto row : c10::irange(qBlockSize)) { + accum_t normalizer = lse_data[i * lStrideB + j * lStrideH + (m + row) * lStrideM]; + vec::map( + [normalizer](Vec x) { return (x - Vec(normalizer)).exp(); }, + attn_data + row * kvBlockSize, + attn_data + row * kvBlockSize, + kvBlockSize); + } + // Apply causal mask, filled unused with 0 + if (is_causal && num_keys - n <= kvSplitSize) { + for (const auto row : c10::irange(qBlockSize)) { + int64_t last_col = m + row - n; + accum_t* row_ptr = attn_data + row * kvBlockSize; + fill_stub(row_ptr + last_col + 1, static_cast(0), kvBlockSize - last_col - 1); + } + } +#ifdef _MSC_VER + if (is_reduced_type) { +#else + if constexpr (is_reduced_type) { +#endif + for (const auto row : c10::irange(qBlockSize)) { + convert( + attn_data + row * kvBlockSize, + attn_reduced_data + row * kvBlockSize, + kvBlockSize); + } + } + // grad_v <- grad_v + attn.T @ grad_out + cpublas::gemm( + TransposeType::NoTranspose, + TransposeType::Transpose, + headSize, + kvBlockSize, + qBlockSize, + static_cast(1), + grad_out_data + i * grad_oStrideB + j * grad_oStrideH + + m * grad_oStrideM, + grad_oStrideM, + conditional_data_ptr(attn_data, attn_reduced_data), + kvBlockSize, + static_cast(1), + grad_v_data + i * grad_vStrideB + j * grad_vStrideH + + n * grad_vStrideN, + grad_vStrideN); + // grad_attn <- grad_out @ v.T + cpublas::gemm( + TransposeType::Transpose, + TransposeType::NoTranspose, + kvBlockSize, + qBlockSize, + headSize, + static_cast(1), + v_data + i * vStrideB + j * vStrideH + + n * vStrideN, + vStrideN, + grad_out_data + i * grad_oStrideB + j * grad_oStrideH + + m * grad_oStrideM, + grad_oStrideM, + static_cast(0), + grad_attn_data, + kvBlockSize); + // grad_attn <- attn * (grad_attn - dsum) + for (const auto row : c10::irange(qBlockSize)) { + accum_t d = *(dsum_data + row); + vec::map2( + [d](Vec attn, Vec grad_attn) { return attn * (grad_attn - Vec(d)); }, + grad_attn_data + row * kvBlockSize, + attn_data + row * kvBlockSize, + grad_attn_data + row * kvBlockSize, + kvBlockSize); + } +#ifdef _MSC_VER + if (is_reduced_type) { +#else + if constexpr (is_reduced_type) { +#endif + for (const auto row : c10::irange(qBlockSize)) { + convert( + grad_attn_data + row * kvBlockSize, + grad_attn_reduced_data + row * kvBlockSize, + kvBlockSize); + } + } + // grad_q <- grad_q + scale * grad_attn @ k + cpublas::gemm( + TransposeType::NoTranspose, + TransposeType::NoTranspose, + headSize, + qBlockSize, + kvBlockSize, + scaling_factor, + k_data + i * kStrideB + j * kStrideH + + n * kStrideN, + kStrideN, + conditional_data_ptr(grad_attn_data, grad_attn_reduced_data), + kvBlockSize, + static_cast(1), + grad_q_data + i * grad_qStrideB + j * grad_qStrideH + + m * grad_qStrideM, + grad_qStrideM); + // grad_k <- grad_k + scale * grad_attn.T @ q + cpublas::gemm( + TransposeType::NoTranspose, + TransposeType::Transpose, + headSize, + kvBlockSize, + qBlockSize, + 
scaling_factor, + q_data + i * qStrideB + j * qStrideH + + m * qStrideM, + qStrideM, + conditional_data_ptr(grad_attn_data, grad_attn_reduced_data), + kvBlockSize, + static_cast(1), + grad_k_data + i * grad_kStrideB + j * grad_kStrideH + + n * grad_kStrideN, + grad_kStrideN); + } + } + // Move to the next query + data_index_step(i, batchSize, j, num_head); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } }); } diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp index 7587988528ebb..702c9f73db48c 100644 --- a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -441,7 +441,11 @@ struct ComputeLocation // See NOTE [ Grid Sample CPU Kernels ] for details. template +<<<<<<< HEAD inline void +======= +static inline void +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mask_scatter_add(const scalar_t *src, scalar_t* base_addr, const int_same_size_t *offsets, const int_same_size_t *mask, int64_t len) { @@ -1030,7 +1034,11 @@ struct ApplyGridSample +<<<<<<< HEAD inline void grid_sample_2d_grid_slice_iterator( +======= +static inline void grid_sample_2d_grid_slice_iterator( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const TensorAccessor& grid_slice, const ApplyFn &apply_fn) { int64_t out_H = grid_slice.size(0); int64_t out_W = grid_slice.size(1); diff --git a/aten/src/ATen/native/cpu/HistogramKernel.cpp b/aten/src/ATen/native/cpu/HistogramKernel.cpp index 261683a187b8a..d48a83b8e3dac 100644 --- a/aten/src/ATen/native/cpu/HistogramKernel.cpp +++ b/aten/src/ATen/native/cpu/HistogramKernel.cpp @@ -259,7 +259,11 @@ void histogramdd_out_cpu_template(const Tensor& self, const std::optional& weight, bool density, +======= +static void histogramdd_kernel_impl(const Tensor& self, const std::optional& weight, bool density, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& hist, const TensorList& bin_edges) { histogramdd_out_cpu_template(self, weight, density, hist, bin_edges); } @@ -269,7 +273,11 @@ void histogramdd_kernel_impl(const Tensor& self, const std::optional& we * * Refer to histogramdd_out_cpu_template for more details. 
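The FlashAttentionKernel.cpp hunks above add kv_num_head and repeat_factor = num_head / kv_num_head on the HEAD side, letting several query heads share one key/value head (grouped-query attention); the packing and gemm calls then index K/V with kv_j = j / repeat_factor instead of j. A minimal sketch of that head mapping, with hypothetical sizes:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Hypothetical sizes: 8 query heads sharing 2 key/value heads.
      const int64_t num_head = 8;
      const int64_t kv_num_head = 2;
      const int64_t repeat_factor = num_head / kv_num_head;  // 4

      for (int64_t j = 0; j < num_head; ++j) {
        const int64_t kv_j = j / repeat_factor;  // KV head serving query head j
        std::printf("query head %lld -> kv head %lld\n",
                    static_cast<long long>(j), static_cast<long long>(kv_j));
      }
      return 0;
    }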
*/ +<<<<<<< HEAD void histogramdd_linear_kernel_impl(const Tensor& self, const std::optional& weight, +======= +static void histogramdd_linear_kernel_impl(const Tensor& self, const std::optional& weight, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool density, Tensor& hist, const TensorList& bin_edges, bool local_search) { if (local_search) { // histogramdd codepath: both hist and bin_edges are eventually returned as output, @@ -298,7 +306,11 @@ void infer_bin_edges_from_input(const Tensor& input, const int64_t N, std::copy(max_data, max_data + N, rightmost_edges.begin()); } +<<<<<<< HEAD void histogram_select_outer_bin_edges_impl(const Tensor& input, const int64_t N, +======= +static void histogram_select_outer_bin_edges_impl(const Tensor& input, const int64_t N, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::vector &leftmost_edges, std::vector &rightmost_edges) { AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "histogramdd", [&]() { infer_bin_edges_from_input(input, N, leftmost_edges, rightmost_edges); diff --git a/aten/src/ATen/native/cpu/IndexKernel.cpp b/aten/src/ATen/native/cpu/IndexKernel.cpp index 57d3ab89c6174..efe05a5611dcb 100644 --- a/aten/src/ATen/native/cpu/IndexKernel.cpp +++ b/aten/src/ATen/native/cpu/IndexKernel.cpp @@ -749,6 +749,7 @@ void flip_kernel(TensorIterator& iter, const bool quantized) { // }); if (iter_dtype == kByte) { +<<<<<<< HEAD cpu_hflip_vec(iter); return; } else if (iter_dtype == kChar) { @@ -772,6 +773,23 @@ void flip_kernel(TensorIterator& iter, const bool quantized) { } else if (iter_dtype == kDouble) { cpu_hflip_vec(iter); return; +======= + return cpu_hflip_vec(iter); + } else if (iter_dtype == kChar) { + return cpu_hflip_vec(iter); + } else if (iter_dtype == kInt) { + return cpu_hflip_vec(iter); + } else if (iter_dtype == kLong) { + return cpu_hflip_vec(iter); + } else if (iter_dtype == kShort) { + return cpu_hflip_vec(iter); + } else if (iter_dtype == kBool) { + return cpu_hflip_vec(iter); + } else if (iter_dtype == kFloat) { + return cpu_hflip_vec(iter); + } else if (iter_dtype == kDouble) { + return cpu_hflip_vec(iter); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // other dtypes (float16, bfloat16, complex) are handled by cpu_kernel_vec (see below) @@ -786,12 +804,19 @@ void flip_kernel(TensorIterator& iter, const bool quantized) { c == input_strides_2[1] && c == iter.element_size(0) * iter.shape()[0] // checks if dim=1 is contiguous as well ) { +<<<<<<< HEAD cpu_hflip_channels_last_vec(iter); return; } // Special case: vertical flip using memcpy (faster than generic cpu_kernel_vec) cpu_vflip_memcpy(iter); return; +======= + return cpu_hflip_channels_last_vec(iter); + } + // Special case: vertical flip using memcpy (faster than generic cpu_kernel_vec) + return cpu_vflip_memcpy(iter); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(), "flip_cpu", diff --git a/aten/src/ATen/native/cpu/Loops.h b/aten/src/ATen/native/cpu/Loops.h index aad618a258a37..a0bd72698c138 100644 --- a/aten/src/ATen/native/cpu/Loops.h +++ b/aten/src/ATen/native/cpu/Loops.h @@ -46,7 +46,11 @@ using namespace vec; template typename traits::ArgsTuple dereference_impl(char* 
C10_RESTRICT data[], const int64_t* strides, int64_t i, +<<<<<<< HEAD std::index_sequence /*unused*/) { +======= + std::index_sequence) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::make_tuple( c10::load::type>( data[INDEX] + i * strides[INDEX])...); @@ -65,7 +69,11 @@ dereference_vec_impl(char* C10_RESTRICT data[], const typename traits::result_type& opt_scalar, size_t S, int64_t i, +<<<<<<< HEAD std::index_sequence /*unused*/) { +======= + std::index_sequence) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vec = typename traits::result_type; using scalar_t = typename Vec::value_type; return std::make_tuple( @@ -89,7 +97,11 @@ execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t using result_type = typename traits::result_type; for (; i < n; i++) { result_type* out_ptr = (result_type*)(data[0] + i * strides[0]); +<<<<<<< HEAD *out_ptr = std::apply(op, dereference( +======= + *out_ptr = c10::guts::apply(op, dereference( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) &data[1], &strides[1], i)); @@ -102,7 +114,11 @@ inline void execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) { using traits = function_traits; for (; i < n; i++) { +<<<<<<< HEAD std::apply(op, dereference( +======= + c10::guts::apply(op, dereference( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) &data[0], &strides[0], i)); @@ -162,7 +178,11 @@ void handle_tuple_outputs(char* C10_RESTRICT data[], } // Loop operation for `cpu_kernel_multiple_outputs`. +<<<<<<< HEAD // 1. Use `std::apply` to make dynamic method invocation +======= +// 1. Use `c10::guts::apply` to make dynamic method invocation +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // for the lambda passed in `cpu_kernel_multiple_outputs`. // 2. Iterate over the members of the returned tuple, set the corresponding // output tensor by the tuple member in `handle_tuple_outputs` function. 
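The Loops.h hunks above swap c10::guts::apply for std::apply when invoking the user lambda on the tuple of dereferenced operands; since C++17 the standard facility performs exactly this tuple-to-argument expansion. A self-contained sketch of the mechanism, using a plain std::tuple rather than the TensorIterator machinery:

    #include <cstdio>
    #include <tuple>

    int main() {
      auto op = [](float a, float b) { return a + b; };

      // Stand-in for the tuple that dereference(...) builds from the operand pointers.
      std::tuple<float, float> args{1.5f, 2.0f};

      float out = std::apply(op, args);  // expands the tuple into op(1.5f, 2.0f)
      std::printf("%f\n", out);          // 3.5
      return 0;
    }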
@@ -183,7 +203,11 @@ multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_ } for (; i < n; i++) { +<<<<<<< HEAD auto output = std::apply(op, dereference( +======= + auto output = c10::guts::apply(op, dereference( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) &data[num_outputs], &strides[num_outputs], i)); @@ -213,8 +237,13 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) { auto args1 = dereference_vec(&data[1], opt_scalar, S, i); auto args2 = dereference_vec(&data[1], opt_scalar, S, i + Vec::size()); +<<<<<<< HEAD auto out1 = std::apply(vop, std::move(args1)); auto out2 = std::apply(vop, std::move(args2)); +======= + auto out1 = c10::guts::apply(vop, std::move(args1)); + auto out2 = c10::guts::apply(vop, std::move(args2)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) out1.store(data[0] + i * sizeof(scalar_t)); out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t)); } @@ -231,7 +260,11 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve template inline void unroll_contiguous_scalar_checks( const int64_t* /*strides*/, +<<<<<<< HEAD std::index_sequence<> /*unused*/, +======= + std::index_sequence<>, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cb_t&& cb) { cb(0); } @@ -239,7 +272,11 @@ inline void unroll_contiguous_scalar_checks( template inline void unroll_contiguous_scalar_checks( const int64_t* strides, +<<<<<<< HEAD std::index_sequence /*unused*/, +======= + std::index_sequence, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cb_t&& cb) { if (is_contiguous_scalar(strides)) { cb(INDEX0 + 1); diff --git a/aten/src/ATen/native/cpu/MaxPoolKernel.cpp b/aten/src/ATen/native/cpu/MaxPoolKernel.cpp index a888b8fa801c5..7bf6841d47c9b 100644 --- a/aten/src/ATen/native/cpu/MaxPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/MaxPoolKernel.cpp @@ -30,7 +30,11 @@ vec::Vectorized is_nan_vec(vec::Vectorized vec) { return vec.isnan(); } +<<<<<<< HEAD // TODO: use is_integral/is_same to check the scalar_t and simplify the implementation +======= +// TODO: use is_integeral/is_same to check the scalar_t and simplify the implementation +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // currently it does not work template <> vec::Vectorized is_nan_vec(vec::Vectorized vec) { diff --git a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp index fca7d8bdce5ae..de0f640094a62 100644 --- a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp +++ b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp @@ -85,11 +85,19 @@ void cpu_max_unpool( if constexpr (is_3d) { TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(), " (output volumes are of size ", output_depth, +<<<<<<< HEAD "x", output_height, "x", output_width, ")"); } else { TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(), " (output volumes are of size ", output_height, "x", output_width, ")"); +======= + "x", output_height, "x", output_width); + } else { + TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(), + 
" (output volumes are of size ", output_height, + "x", output_width); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } diff --git a/aten/src/ATen/native/cpu/MultinomialKernel.cpp b/aten/src/ATen/native/cpu/MultinomialKernel.cpp index 7ea8e87e28b1b..bfb8417759702 100644 --- a/aten/src/ATen/native/cpu/MultinomialKernel.cpp +++ b/aten/src/ATen/native/cpu/MultinomialKernel.cpp @@ -210,7 +210,11 @@ multinomial_with_replacement_apply( } } +<<<<<<< HEAD void multinomial_with_replacement_kernel_impl( +======= +static void multinomial_with_replacement_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& result, const Tensor& self, const int64_t n_sample, diff --git a/aten/src/ATen/native/cpu/PaddingKernel.cpp b/aten/src/ATen/native/cpu/PaddingKernel.cpp index 853fc959f6345..fb0af34aead7b 100644 --- a/aten/src/ATen/native/cpu/PaddingKernel.cpp +++ b/aten/src/ATen/native/cpu/PaddingKernel.cpp @@ -96,7 +96,11 @@ struct ReplicationPad { }; template +<<<<<<< HEAD inline void copy_stub(scalar_t* out, const scalar_t* in, int64_t size) { +======= +static inline void copy_stub(scalar_t* out, const scalar_t* in, int64_t size) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vec = Vectorized; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { @@ -112,7 +116,11 @@ inline void copy_stub(scalar_t* out, const scalar_t* in, int64_t size) { } template +<<<<<<< HEAD inline void add_stub(scalar_t* grad_in, const scalar_t* grad_out, int64_t size) { +======= +static inline void add_stub(scalar_t* grad_in, const scalar_t* grad_out, int64_t size) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vec = Vectorized; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { @@ -156,7 +164,11 @@ void cpu_padding( int64_t offset_h = ndim >= 2 ? 
p.offsets[ndim - 2] : 0; int64_t offset_w = p.offsets[ndim - 1]; +<<<<<<< HEAD // do vectorized copy when output is overlapped with input on W, +======= + // do vectorized copy whe output is overlapped with input on W, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // only applies to positive padding auto loop = [=](scalar_t* out, const scalar_t* in, bool positive_padding) { if (positive_padding) { diff --git a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp index 6fad9270bf193..f70e4e24a95d3 100644 --- a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp @@ -9,7 +9,11 @@ namespace at::native { namespace { +<<<<<<< HEAD void addcmul_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { +======= +static void addcmul_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ScalarType dtype = iter.common_dtype(); if (at::isReducedFloatingType(dtype)) { AT_DISPATCH_REDUCED_FLOATING_TYPES(dtype, "addcmul_cpu_out", [&]() { @@ -50,7 +54,11 @@ void addcmul_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { } } +<<<<<<< HEAD void addcdiv_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { +======= +static void addcdiv_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ScalarType dtype = iter.common_dtype(); if (at::isReducedFloatingType(dtype)) { AT_DISPATCH_REDUCED_FLOATING_TYPES(dtype, "addcdiv_cpu_out", [&]() { @@ -90,7 +98,11 @@ void addcdiv_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { } } +<<<<<<< HEAD void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double beta) { +======= +static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double beta) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ScalarType dtype = iter.dtype(0); if (dtype == kBFloat16) { auto norm_val = norm.to(); @@ -176,7 +188,11 @@ void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, dou } } +<<<<<<< HEAD void huber_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double delta) { +======= +static void huber_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double delta) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ScalarType dtype = iter.dtype(0); AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, dtype, "huber_backward_cpu_out", [&] { auto norm_val = norm.to(); @@ -215,7 +231,11 @@ void huber_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double }); } +<<<<<<< HEAD void mse_backward_cpu_kernel(TensorIterator& iter, const Scalar& value) { +======= +static void mse_backward_cpu_kernel(TensorIterator& iter, const Scalar& value) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ScalarType dtype = iter.dtype(0); AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, dtype, "mse_backward_cpu_out", [&] { scalar_t scalar_val = value.to(); diff --git a/aten/src/ATen/native/cpu/PowKernel.cpp 
b/aten/src/ATen/native/cpu/PowKernel.cpp index ed23503099ed3..c0839ee3e220b 100644 --- a/aten/src/ATen/native/cpu/PowKernel.cpp +++ b/aten/src/ATen/native/cpu/PowKernel.cpp @@ -96,6 +96,7 @@ static void pow_tensor_scalar_kernel( dtype == kBFloat16 || isComplexType(dtype)) { // Dispatch to fast specialization for sqrt, rsqrt and reciprocal if (exp_scalar.equal(.5)) { +<<<<<<< HEAD sqrt_kernel(iter); return; } else if (exp_scalar.equal(-0.5)) { @@ -104,6 +105,13 @@ static void pow_tensor_scalar_kernel( } else if (exp_scalar.equal(-1.0)) { reciprocal_kernel(iter); return; +======= + return sqrt_kernel(iter); + } else if (exp_scalar.equal(-0.5)) { + return rsqrt_kernel(iter); + } else if (exp_scalar.equal(-1.0)) { + return reciprocal_kernel(iter); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } @@ -120,7 +128,11 @@ static void pow_tensor_scalar_kernel( } else if (dtype == ScalarType::Half) { [&]() { using scalar_t = +<<<<<<< HEAD c10::impl::ScalarTypeToCPPTypeT; +======= + decltype(c10::impl::ScalarTypeToCPPType::t); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto exp = exp_scalar.to(); using Vec = Vectorized; cpu_kernel_vec(iter, diff --git a/aten/src/ATen/native/cpu/README.md b/aten/src/ATen/native/cpu/README.md index 6a7ed0d12b0eb..838a3d0282ecf 100644 --- a/aten/src/ATen/native/cpu/README.md +++ b/aten/src/ATen/native/cpu/README.md @@ -74,7 +74,11 @@ it to sum up the entire array into a single value. `ReduceOpsKernel.cpp` uses the `CPU_CAPABILITY_*` macros to "know" under which compiler flags it is currently compiled. This allows the programmer to write +<<<<<<< HEAD generic code, which will be compiled under multiplied compilation settings. +======= +generic code, which will be compiled under multipled compilation settings. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) `../ReduceOps.cpp` now includes the header `ReduceOpsKernel.h`, which contains a generic definition of `sumImplAll`. 
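The README hunk above describes writing one generic kernel body that gets compiled several times, once per CPU_CAPABILITY_* setting. A rough, simplified illustration of that idea (the macro name, flags, and namespace here are stand-ins, not the actual ATen build wiring): compile the same translation unit more than once with different definitions, so each copy lands in its own namespace and the optimizer vectorizes it according to the flags used for that copy.

    // Hypothetical build lines for the sketch below:
    //   c++ -O2        -DMY_CAPABILITY=scalar -c sum.cpp -o sum_scalar.o
    //   c++ -O2 -mavx2 -DMY_CAPABILITY=avx2   -c sum.cpp -o sum_avx2.o
    #include <cstddef>

    #ifndef MY_CAPABILITY
    #define MY_CAPABILITY scalar
    #endif

    namespace MY_CAPABILITY {

    // Written once; how well it vectorizes depends on the flags for this copy.
    float sum_all(const float* data, std::size_t n) {
      float acc = 0.f;
      for (std::size_t i = 0; i < n; ++i) {
        acc += data[i];
      }
      return acc;
    }

    }  // namespace MY_CAPABILITY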
This function allows the user to reduce diff --git a/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp b/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp index b469aa5c2eee6..a38342191b8ed 100644 --- a/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp +++ b/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp @@ -18,7 +18,11 @@ namespace { using namespace vec; +<<<<<<< HEAD void arange_kernel(TensorIterator& iter, const Scalar& scalar_start, const Scalar& scalar_steps, const Scalar& scalar_step) { +======= +static void arange_kernel(TensorIterator& iter, const Scalar& scalar_start, const Scalar& scalar_steps, const Scalar& scalar_step) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "arange_cpu", [&]() { using accscalar_t = at::acc_type; auto start = scalar_start.to(); @@ -42,7 +46,11 @@ void arange_kernel(TensorIterator& iter, const Scalar& scalar_start, const Scala }); } +<<<<<<< HEAD void linspace_kernel(TensorIterator& iter, const Scalar& scalar_start, const Scalar& scalar_end, int64_t steps) { +======= +static void linspace_kernel(TensorIterator& iter, const Scalar& scalar_start, const Scalar& scalar_end, int64_t steps) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, iter.dtype(), "linspace_cpu", [&]() { // step should be of double type for all integral types using step_t = std::conditional_t, double, scalar_t>; diff --git a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp index c7eaa802af125..252298596ad96 100644 --- a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp @@ -62,7 +62,11 @@ inline void reduce_all_impl( output.fill_(result); } +<<<<<<< HEAD void min_all_kernel_impl(Tensor& result, const Tensor& input) { +======= +static void min_all_kernel_impl(Tensor& result, const Tensor& input) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (input.scalar_type() == ScalarType::Bool) { TensorIterator iter = TensorIteratorConfig() .add_input(input) @@ -87,7 +91,11 @@ void min_all_kernel_impl(Tensor& result, const Tensor& input) { } } +<<<<<<< HEAD void max_all_kernel_impl(Tensor& result, const Tensor& input) { +======= +static void max_all_kernel_impl(Tensor& result, const Tensor& input) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (input.scalar_type() == ScalarType::Bool) { TensorIterator iter = TensorIteratorConfig() .add_input(input) @@ -167,7 +175,11 @@ inline void reduce_all_impl_vec_two_outputs( output2.fill_(result.second); } +<<<<<<< HEAD void aminmax_allreduce_kernel( +======= +static void aminmax_allreduce_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, Tensor& min_result, Tensor& max_result) { diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index 2e62936501948..e1d0689713420 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -28,7 +28,11 @@ namespace at::native { namespace { using namespace vec; template +<<<<<<< HEAD 
inline void cpu_cum_base_kernel(const Tensor& result, +======= +static inline void cpu_cum_base_kernel(const Tensor& result, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, int64_t dim, const func_t& f, @@ -76,7 +80,11 @@ inline void cpu_cum_base_kernel(const Tensor& result, iter.for_each(loop, grain_size); } +<<<<<<< HEAD void cumsum_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { +======= +static void cumsum_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto wrap_dim = maybe_wrap_dim(dim, self.dim()); int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim); @@ -95,7 +103,11 @@ void cumsum_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { }); } +<<<<<<< HEAD void cumprod_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { +======= +static void cumprod_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto wrap_dim = maybe_wrap_dim(dim, self.dim()); int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim); @@ -114,7 +126,11 @@ void cumprod_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { }); } +<<<<<<< HEAD void logcumsumexp_cpu_kernel(Tensor& result, const Tensor& self, int64_t dim) { +======= +static void logcumsumexp_cpu_kernel(Tensor& result, const Tensor& self, int64_t dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto wrap_dim = maybe_wrap_dim(dim, self.dim()); int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim); @@ -135,7 +151,11 @@ void logcumsumexp_cpu_kernel(Tensor& result, const Tensor& self, int64_t dim) { }); } +<<<<<<< HEAD void std_var_kernel_impl(TensorIterator& iter, double correction, bool take_sqrt) { +======= +static void std_var_kernel_impl(TensorIterator& iter, double correction, bool take_sqrt) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "std_cpu", [&] { binary_kernel_reduce( iter, @@ -148,7 +168,11 @@ void std_var_kernel_impl(TensorIterator& iter, double correction, bool take_sqrt }); } +<<<<<<< HEAD void prod_kernel_impl(TensorIterator& iter) { +======= +static void prod_kernel_impl(TensorIterator& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Workaround for the error: '*' in boolean context, suggest '&&' instead if (iter.dtype() == ScalarType::Bool) { using scalar_t = bool; @@ -203,7 +227,11 @@ void norm_kernel_cpu_impl(TensorIterator& iter, const double& val) { } } +<<<<<<< HEAD void norm_kernel_tensor_iterator_impl( +======= +static void norm_kernel_tensor_iterator_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorIterator& iter, const Scalar& p) { double val = 0; @@ -256,10 +284,17 @@ void norm_kernel_tensor_iterator_impl( } else { if (iter.input_dtype() == kHalf && iter.dtype(0) == kFloat) { // type promotion that does cast and reduction in a single kernel +<<<<<<< HEAD 
norm_kernel_cpu_impl(iter, val); return; } else if (iter.input_dtype() == kBFloat16 && iter.dtype(0) == kFloat) { // type promotion that does cast and reduction in a single kernel norm_kernel_cpu_impl(iter, val); return; +======= + return norm_kernel_cpu_impl(iter, val); + } else if (iter.input_dtype() == kBFloat16 && iter.dtype(0) == kFloat) { + // type promotion that does cast and reduction in a single kernel + return norm_kernel_cpu_impl(iter, val); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kHalf, kBFloat16, kComplexHalf, iter.input_dtype(), "norm_cpu", [&] { @@ -274,7 +309,11 @@ void norm_kernel_tensor_iterator_impl( } } +<<<<<<< HEAD void and_kernel_impl(TensorIterator& iter) { +======= +static void and_kernel_impl(TensorIterator& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (iter.dtype() == ScalarType::Byte) { // Refer [all, any : uint8 compatibility] binary_kernel_reduce_vec( @@ -312,7 +351,11 @@ void and_kernel_impl(TensorIterator& iter) { } } +<<<<<<< HEAD void or_kernel_impl(TensorIterator& iter) { +======= +static void or_kernel_impl(TensorIterator& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (iter.dtype() == ScalarType::Byte) { // Refer [all, any : uint8 compatibility] binary_kernel_reduce_vec( @@ -346,7 +389,11 @@ struct MinValuesOps: public at::native::MinOps { } }; +<<<<<<< HEAD void min_values_kernel_impl(TensorIterator& iter) { +======= +static void min_values_kernel_impl(TensorIterator& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (iter.dtype() == kLong) { // This case is special because of Vectorized does not // handle upper_bound(). 
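The norm fast path above is annotated as a "type promotion that does cast and reduction in a single kernel": half or bfloat16 inputs with a float output are converted element-by-element while reducing, instead of materializing a casted copy first. Sketched here on plain scalars, with float standing in for the low-precision input dtype and double for the accumulation dtype (not the ATen dispatch):

    #include <cstdio>

    // Illustrative only: cast each element to the wider type as it is consumed,
    // accumulating in that wider type within the same pass.
    double sum_of_squares(const float* x, int n) {
      double acc = 0.0;                              // wider accumulator
      for (int i = 0; i < n; ++i) {
        const double v = static_cast<double>(x[i]);  // cast on the fly
        acc += v * v;                                // reduce in the same pass
      }
      return acc;
    }

    int main() {
      const float data[4] = {1.f, 2.f, 3.f, 4.f};
      std::printf("%f\n", sum_of_squares(data, 4));  // 30.0
      return 0;
    }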
@@ -367,7 +414,11 @@ void min_values_kernel_impl(TensorIterator& iter) { }); } +<<<<<<< HEAD void max_values_kernel_impl(TensorIterator& iter) { +======= +static void max_values_kernel_impl(TensorIterator& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "max_values_cpu", [&iter] { binary_kernel_reduce_vec( iter, @@ -377,7 +428,11 @@ void max_values_kernel_impl(TensorIterator& iter) { }); } +<<<<<<< HEAD void argmax_kernel_impl(TensorIterator &iter) { +======= +static void argmax_kernel_impl(TensorIterator &iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(1), "argmax_cpu", [&] { if (is_reduce_lastdim(iter)) { using arg_t = std::pair; @@ -401,7 +456,11 @@ void argmax_kernel_impl(TensorIterator &iter) { }); } +<<<<<<< HEAD void argmin_kernel_impl(TensorIterator &iter) { +======= +static void argmin_kernel_impl(TensorIterator &iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(1), "argmin_cpu", [&] { if (is_reduce_lastdim(iter)) { using arg_t = std::pair; @@ -425,6 +484,7 @@ void argmin_kernel_impl(TensorIterator &iter) { }); } +<<<<<<< HEAD template struct XorSumOps { inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { @@ -468,6 +528,8 @@ void xor_sum_kernel_impl(TensorIterator& iter) { }); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // anonymous namespace REGISTER_DISPATCH(std_var_stub, &std_var_kernel_impl) @@ -482,7 +544,10 @@ REGISTER_DISPATCH(min_values_stub, &min_values_kernel_impl) REGISTER_DISPATCH(max_values_stub, &max_values_kernel_impl) REGISTER_DISPATCH(argmax_stub, &argmax_kernel_impl) REGISTER_DISPATCH(argmin_stub, &argmin_kernel_impl) +<<<<<<< HEAD REGISTER_DISPATCH(xor_sum_stub, &xor_sum_kernel_impl) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) REGISTER_DISPATCH(cumprod_stub, &cumprod_cpu_kernel) REGISTER_DISPATCH(cumsum_stub, &cumsum_cpu_kernel) diff --git a/aten/src/ATen/native/cpu/ReduceUtils.h b/aten/src/ATen/native/cpu/ReduceUtils.h index 1b0be8d18db7d..199b89337bd62 100644 --- a/aten/src/ATen/native/cpu/ReduceUtils.h +++ b/aten/src/ATen/native/cpu/ReduceUtils.h @@ -8,6 +8,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::native { inline namespace CPU_CAPABILITY { diff --git a/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp b/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp index 8d22201ed63c4..63419dcb6861f 100644 --- a/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp +++ b/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp @@ -428,11 +428,18 @@ void fp16_gemv_trans( TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0); #if !defined(__aarch64__) || defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) if (at::globalContext().allowFP16ReductionCPU()) { +<<<<<<< HEAD 
fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy); return; } #endif fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy); +======= + return fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy); + } +#endif + return fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } float bf16_dot_with_fp32_arith(const at::BFloat16* vec1, const at::BFloat16* vec2, int64_t len) { @@ -466,7 +473,11 @@ void bf16_gemv_trans( at::BFloat16* y, const int incy) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0 && beta == 0.0); +<<<<<<< HEAD bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy); +======= + return bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } float fp16_dot( diff --git a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp index 895263bc44664..78663cecf2163 100644 --- a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp +++ b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp @@ -41,7 +41,11 @@ class ReduceMultiply { *self_data = c10::load(self_data) && c10::load(src_data); } }; +<<<<<<< HEAD ReduceMultiply reduce_multiply; +======= +static ReduceMultiply reduce_multiply; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class ReduceAdd { public: @@ -51,7 +55,11 @@ class ReduceAdd { *self_data += opmath_t(c10::load(src_data)); } }; +<<<<<<< HEAD ReduceAdd reduce_add; +======= +static ReduceAdd reduce_add; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class ReduceMean { public: @@ -61,7 +69,11 @@ class ReduceMean { *self_data += opmath_t(c10::load(src_data)); } }; +<<<<<<< HEAD ReduceMean reduce_mean; +======= +static ReduceMean reduce_mean; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class ReduceMaximum { public: @@ -73,7 +85,11 @@ class ReduceMaximum { *self_data = at::_isnan(src_value) ? opmath_t(src_value) : std::max(self_value, opmath_t(src_value)); } }; +<<<<<<< HEAD ReduceMaximum reduce_maximum; +======= +static ReduceMaximum reduce_maximum; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class ReduceMinimum { public: @@ -85,7 +101,11 @@ class ReduceMinimum { *self_data = at::_isnan(src_value) ? 
opmath_t(src_value) : std::min(self_value, opmath_t(src_value)); } }; +<<<<<<< HEAD ReduceMinimum reduce_minimum; +======= +static ReduceMinimum reduce_minimum; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class TensorAssign { public: @@ -95,7 +115,11 @@ class TensorAssign { *self_data = opmath_t(c10::load(src_data)); } }; +<<<<<<< HEAD TensorAssign tensor_assign; +======= +static TensorAssign tensor_assign; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template struct _cpu_scatter_gather_dim_loop { diff --git a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp index 9ecfe55cedc4a..abcfab16806f1 100644 --- a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp +++ b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp @@ -7,7 +7,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -17,6 +20,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // [Note AVX-SSE transitions] In general we avoid calls into cmath for code // compiled with AVX/AVX2 This is because of SSE-AVX transitions and a bug in @@ -647,10 +654,17 @@ _vec_softmax( parallel_for( 0, outer_size * inner_size, 0, [&](int64_t begin, int64_t end) { int64_t idx = begin; +<<<<<<< HEAD std::vector temp_vec_input(dim_size * vectorized_step); std::vector temp_vec_output(dim_size * vectorized_step); float* temp_vec_input_data = temp_vec_input.data(); float* temp_vec_output_data = temp_vec_output.data(); +======= + std::unique_ptr temp_vec_input(new float[dim_size*vectorized_step]()); + std::unique_ptr temp_vec_output(new float[dim_size*vectorized_step]()); + float* temp_vec_input_data = temp_vec_input.get(); + float* temp_vec_output_data = temp_vec_output.get(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) while (idx < end) { int64_t outer_idx = idx / inner_size; int64_t inner_idx = idx % inner_size; @@ -968,7 +982,11 @@ struct vec_host_softmax_backward { } }; +<<<<<<< HEAD void softmax_lastdim_kernel_impl( +======= +static void softmax_lastdim_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& result, const Tensor& self) { AT_DISPATCH_FLOATING_TYPES_AND2( @@ -977,13 +995,21 @@ void softmax_lastdim_kernel_impl( [&] { vec_host_softmax_lastdim::apply(result, self); }); } +<<<<<<< HEAD void softmax_kernel_impl(const Tensor& result, const Tensor& self, int64_t dim) { +======= +static void softmax_kernel_impl(const Tensor& result, const Tensor& self, int64_t dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, self.scalar_type(), "softmax_kernel_impl", [&] { vec_softmax::apply(result, self, dim); }); } +<<<<<<< HEAD void log_softmax_lastdim_kernel_impl( +======= +static void log_softmax_lastdim_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 
const Tensor& result, const Tensor& self) { AT_DISPATCH_FLOATING_TYPES_AND2( @@ -992,13 +1018,21 @@ void log_softmax_lastdim_kernel_impl( [&] { vec_host_softmax_lastdim::apply(result, self); }); } +<<<<<<< HEAD void log_softmax_kernel_impl(const Tensor& result, const Tensor& self, int64_t dim) { +======= +static void log_softmax_kernel_impl(const Tensor& result, const Tensor& self, int64_t dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, self.scalar_type(), "softmax_kernel_impl", [&] { vec_softmax::apply(result, self, dim); }); } +<<<<<<< HEAD void softmax_backward_lastdim_kernel_impl( +======= +static void softmax_backward_lastdim_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& grad_input, const Tensor& grad, const Tensor& output) { @@ -1010,7 +1044,11 @@ void softmax_backward_lastdim_kernel_impl( }); } +<<<<<<< HEAD void log_softmax_backward_lastdim_kernel_impl( +======= +static void log_softmax_backward_lastdim_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& grad_input, const Tensor& grad, const Tensor& output) { @@ -1022,7 +1060,11 @@ void log_softmax_backward_lastdim_kernel_impl( }); } +<<<<<<< HEAD void softmax_backward_kernel_impl( +======= +static void softmax_backward_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& grad_input, const Tensor& grad, const Tensor& output, @@ -1038,7 +1080,11 @@ void softmax_backward_kernel_impl( }); } +<<<<<<< HEAD void log_softmax_backward_kernel_impl( +======= +static void log_softmax_backward_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& grad_input, const Tensor& grad, const Tensor& output, diff --git a/aten/src/ATen/native/cpu/SortingKernel.cpp b/aten/src/ATen/native/cpu/SortingKernel.cpp index 7d337c119c983..22ff180e1333a 100644 --- a/aten/src/ATen/native/cpu/SortingKernel.cpp +++ b/aten/src/ATen/native/cpu/SortingKernel.cpp @@ -90,7 +90,11 @@ struct KeyValueCompDesc { }; #ifdef USE_FBGEMM +<<<<<<< HEAD bool can_use_radix_sort(const TensorBase& values, const bool descending) { +======= +static bool can_use_radix_sort(const TensorBase& values, const bool descending) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // radix_sort can be used only for 1D data if (values.dim() != 1) return false; // radix_sort sorts in ascending order @@ -106,7 +110,11 @@ bool can_use_radix_sort(const TensorBase& values, const bool descending) { return true; } +<<<<<<< HEAD void parallel_sort1d_kernel( +======= +static void parallel_sort1d_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const TensorBase& values, const TensorBase& indices) { AT_DISPATCH_INTEGRAL_TYPES(values.scalar_type(), "parallel_sort1d_kernel", [&] { @@ -140,7 +148,11 @@ void parallel_sort1d_kernel( #endif template +<<<<<<< HEAD inline void sort_kernel_impl(const value_accessor_t& value_accessor, +======= +static inline void sort_kernel_impl(const 
value_accessor_t& value_accessor, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const indices_accessor_t& indices_accessor, int64_t dim_size, bool descending, bool stable) { auto composite_accessor = CompositeRandomAccessorCPU< @@ -165,7 +177,11 @@ inline void sort_kernel_impl(const value_accessor_t& value_accessor, } } +<<<<<<< HEAD void sort_kernel( +======= +static void sort_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const TensorBase& self, const TensorBase& values, const TensorBase& indices, @@ -222,7 +238,11 @@ void sort_kernel( ); } +<<<<<<< HEAD void topk_kernel( +======= +static void topk_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const TensorBase &values, const TensorBase &indices, const TensorBase &self, diff --git a/aten/src/ATen/native/cpu/SumKernel.cpp b/aten/src/ATen/native/cpu/SumKernel.cpp index 0fda4ae05f3e0..f6a86ac9484d1 100644 --- a/aten/src/ATen/native/cpu/SumKernel.cpp +++ b/aten/src/ATen/native/cpu/SumKernel.cpp @@ -286,12 +286,20 @@ struct CastStoreAccumulate { }; template +<<<<<<< HEAD void store(char * C10_RESTRICT data, int64_t stride, int64_t index, scalar_t value) { +======= +static void store(char * C10_RESTRICT data, int64_t stride, int64_t index, scalar_t value) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) StorePolicy::store(data, stride, index, value); } template +<<<<<<< HEAD void store(char * C10_RESTRICT data, int64_t stride, int64_t index, +======= +static void store(char * C10_RESTRICT data, int64_t stride, int64_t index, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const std::array &values) { auto *base_ptr = data + stride * index; for (const auto k : c10::irange(numel)) { @@ -301,7 +309,11 @@ void store(char * C10_RESTRICT data, int64_t stride, int64_t index, } template +<<<<<<< HEAD void store(char * C10_RESTRICT data, int64_t stride, int64_t index, +======= +static void store(char * C10_RESTRICT data, int64_t stride, int64_t index, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Vectorized &values) { using vec_t = Vectorized; alignas(64) std::array array_values{}; diff --git a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp index c479e1610cbeb..ad4193e69926e 100644 --- a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp +++ b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp @@ -29,7 +29,11 @@ namespace at::native { namespace { template +<<<<<<< HEAD inline void compare_base_kernel_core( +======= +static inline void compare_base_kernel_core( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& result1, const Tensor& result2, const Tensor& self, @@ -71,7 +75,11 @@ inline void compare_base_kernel_core( } template +<<<<<<< HEAD inline void compare_base_kernel(const Tensor& result1, const Tensor& result2, +======= +static inline void compare_base_kernel(const Tensor& result1, const Tensor& result2, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) const Tensor& self, int64_t dim, bool keepdim, @@ -98,7 +106,11 @@ inline void compare_base_kernel(const Tensor& result1, const Tensor& result2, result1, result2, self, dim, keepdim, loop); } +<<<<<<< HEAD void min_kernel_impl( +======= +static void min_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& result, const Tensor& indice, const Tensor& self, @@ -131,7 +143,11 @@ void min_kernel_impl( }); } +<<<<<<< HEAD void max_kernel_impl( +======= +static void max_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& result, const Tensor& indice, const Tensor& self, @@ -164,7 +180,11 @@ void max_kernel_impl( }); } +<<<<<<< HEAD void aminmax_kernel( +======= +static void aminmax_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, int64_t dim, bool keepdim, @@ -212,7 +232,11 @@ void aminmax_kernel( }); } +<<<<<<< HEAD void where_kernel_impl(TensorIterator &iter) { +======= +static void where_kernel_impl(TensorIterator &iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_V2( iter.dtype(), "where_cpu", [&] { cpu_kernel( @@ -224,19 +248,31 @@ void where_kernel_impl(TensorIterator &iter) { kComplexHalf, kHalf, kBFloat16, kBool, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_FLOAT8_TYPES)); } +<<<<<<< HEAD void isposinf_kernel_impl(TensorIteratorBase& iter) { +======= +static void isposinf_kernel_impl(TensorIteratorBase& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.input_dtype(), "isposinf_cpu", [&]() { cpu_kernel(iter, [](scalar_t a) -> bool { return a == std::numeric_limits::infinity(); }); }); } +<<<<<<< HEAD void isneginf_kernel_impl(TensorIteratorBase& iter) { +======= +static void isneginf_kernel_impl(TensorIteratorBase& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.input_dtype(), "isneginf_cpu", [&]() { cpu_kernel(iter, [](scalar_t a) -> bool { return a == -std::numeric_limits::infinity(); }); }); } +<<<<<<< HEAD void mode_kernel_impl( +======= +static void mode_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& values, Tensor& indices, const Tensor& self, @@ -308,7 +344,11 @@ void mode_kernel_impl( // Default brute force implementation of isin(). Used when the number of test elements is small. // Iterates through each element and checks it against each test element. 
+<<<<<<< HEAD void isin_default_kernel_cpu( +======= +static void isin_default_kernel_cpu( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& elements, const Tensor& test_elements, bool invert, @@ -339,7 +379,11 @@ void isin_default_kernel_cpu( }); } +<<<<<<< HEAD void clamp_kernel_impl(TensorIteratorBase& iter) { +======= +static void clamp_kernel_impl(TensorIteratorBase& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "clamp_cpu", [&]() { cpu_kernel_vec(iter, [](scalar_t a, scalar_t min, scalar_t max) -> scalar_t { @@ -355,7 +399,11 @@ void clamp_kernel_impl(TensorIteratorBase& iter) { }); } +<<<<<<< HEAD void clamp_scalar_kernel_impl(TensorIteratorBase& iter, const Scalar& min_, const Scalar& max_) { +======= +static void clamp_scalar_kernel_impl(TensorIteratorBase& iter, const Scalar& min_, const Scalar& max_) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "clamp_scalar_cpu", [&]() { const auto min = min_.to(); const auto max = max_.to(); @@ -371,7 +419,11 @@ void clamp_scalar_kernel_impl(TensorIteratorBase& iter, const Scalar& min_, cons }); } +<<<<<<< HEAD void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max_) { +======= +static void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max_) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "clamp_max_scalar_cpu", [&]() { const auto max = max_.to(); const Vectorized max_vec(max); @@ -385,7 +437,11 @@ void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max_) { }); } +<<<<<<< HEAD void clamp_min_scalar_kernel_impl(TensorIteratorBase& iter, Scalar min_) { +======= +static void clamp_min_scalar_kernel_impl(TensorIteratorBase& iter, Scalar min_) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "clamp_min_scalar_cpu", [&]() { const auto min = min_.to(); const Vectorized min_vec(min); diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp index 444ec10861da8..ff397a8e183e9 100644 --- a/aten/src/ATen/native/cpu/Unfold2d.cpp +++ b/aten/src/ATen/native/cpu/Unfold2d.cpp @@ -13,7 +13,11 @@ namespace at::native { namespace { template +<<<<<<< HEAD inline void cadd( +======= +static inline void cadd( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* z, const scalar_t* x, const scalar_t* y, @@ -34,7 +38,11 @@ inline void cadd( } template +<<<<<<< HEAD void unfolded2d_acc( +======= +static void unfolded2d_acc( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* finput_data, scalar_t* input_data, int64_t kH, @@ -113,7 +121,11 @@ void unfolded2d_acc( } template +<<<<<<< HEAD void unfolded2d_acc_channels_last( +======= +static void unfolded2d_acc_channels_last( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* finput_data, scalar_t* input_data, int64_t kH, @@ -169,10 +181,13 @@ void unfolded2d_acc_channels_last( /* note: due to write issues, this one cannot be parallelized as well as * unfolded2d_copy */ +<<<<<<< HEAD #if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_BF16) // Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE without BF16 __attribute__((optimize("no-tree-vectorize"))) #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void unfolded2d_acc_kernel( ScalarType dtype, void *finput_data, @@ -225,7 +240,11 @@ void unfolded2d_acc_kernel( } template +<<<<<<< HEAD void unfolded2d_copy( +======= +static void unfolded2d_copy( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t* input_data, scalar_t* finput_data, int64_t kH, @@ -240,7 +259,11 @@ void unfolded2d_copy( int64_t output_height, int64_t output_width) { at::parallel_for( +<<<<<<< HEAD 0, n_input_plane * kH * kW, 0, [&](int64_t start, int64_t end) { +======= + 0, (int64_t)n_input_plane * kH * kW, 0, [&](int64_t start, int64_t end) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto k : c10::irange(start, end)) { int64_t nip = k / (kH * kW); int64_t rest = k % (kH * kW); @@ -316,7 +339,11 @@ void unfolded2d_copy( for (int64_t x = 0; x < output_width; x++) memcpy( dst + (size_t)y * output_width + x, +<<<<<<< HEAD src + (size_t)iy * input_width + ix + x * dW, +======= + src + (size_t)iy * input_width + ix + (int64_t)x * dW, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sizeof(scalar_t) * (1)); } } @@ -326,7 +353,11 @@ void unfolded2d_copy( } template +<<<<<<< HEAD void unfolded2d_copy_channels_last( +======= +static void unfolded2d_copy_channels_last( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t* input_data, scalar_t* finput_data, int64_t kH, diff --git a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp index e59e5985bf7f3..56859dca14ed8 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp @@ -157,13 +157,21 @@ struct Interpolate<1, scalar_t, opmath_t, index_t, 2> { }; template +<<<<<<< HEAD inline scalar_t interpolate(char* src, char** data, const int64_t* strides, int64_t i) { +======= +static inline scalar_t interpolate(char* src, char** data, const int64_t* strides, int64_t i) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using opmath_t = at::opmath_type; return Interpolate::eval(src, data, strides, i); } template +<<<<<<< HEAD inline scalar_t interpolate_aa_single_dim_zero_strides( +======= +static inline scalar_t interpolate_aa_single_dim_zero_strides( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) char* src, char** data, const index_t ids_stride) { @@ -187,7 +195,11 @@ inline scalar_t interpolate_aa_single_dim_zero_strides( } template +<<<<<<< HEAD inline scalar_t interpolate_aa_single_dim( +======= 
+static inline scalar_t interpolate_aa_single_dim( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) char* src, char** data, const int64_t* strides, @@ -213,7 +225,11 @@ inline scalar_t interpolate_aa_single_dim( } template +<<<<<<< HEAD inline bool is_zero_stride(const int64_t* strides) { +======= +static inline bool is_zero_stride(const int64_t* strides) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool output = strides[0] == 0; for (const auto i : c10::irange(1, m)) { output &= (strides[i] == 0); @@ -222,7 +238,11 @@ inline bool is_zero_stride(const int64_t* strides) { } template +<<<<<<< HEAD inline bool is_contiguous_stride(const int64_t* strides) { +======= +static inline bool is_contiguous_stride(const int64_t* strides) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool output = (strides[0] == sizeof(index_t)) && (strides[1] == sizeof(scalar_t)); for (int i=2; i<2 * interp_size; i+=2) { output &= (strides[i] == sizeof(index_t)) && (strides[i + 1] == sizeof(scalar_t)); @@ -282,13 +302,21 @@ struct CheckAlmostAllZeroStrides<0, non_zero_stride_dim, scalar_t, index_t, inte }; template +<<<<<<< HEAD inline bool check_almost_all_zero_stride(const int64_t* strides) { +======= +static inline bool check_almost_all_zero_stride(const int64_t* strides) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return CheckAlmostAllZeroStrides::eval(strides); } // Helper method to compute interpolation for nearest, linear, cubic modes template +<<<<<<< HEAD inline void basic_loop(char** data, const int64_t* strides, int64_t n) { +======= +static inline void basic_loop(char** data, const int64_t* strides, int64_t n) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) char* dst = data[0]; char* src = data[1]; for (const auto i : c10::irange(n)) { @@ -298,7 +326,11 @@ inline void basic_loop(char** data, const int64_t* strides, int64_t n) { } template +<<<<<<< HEAD inline void basic_loop_aa_vertical( +======= +static inline void basic_loop_aa_vertical( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) char** data, const int64_t* strides, int64_t n, @@ -354,7 +386,11 @@ inline void basic_loop_aa_vertical( } template +<<<<<<< HEAD inline void basic_loop_aa_horizontal( +======= +static inline void basic_loop_aa_horizontal( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) char** data, const int64_t* strides, int64_t n, @@ -1038,7 +1074,11 @@ struct HelperInterpNearest : public HelperInterpBase { // We keep this structure for BC and consider as deprecated. 
// See HelperInterpNearestExact as replacement +<<<<<<< HEAD static constexpr int interp_size = 1; +======= + static const int interp_size = 1; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline void init_indices_weights( at::ScalarType output_type, @@ -1155,7 +1195,11 @@ struct HelperInterpNearestExact : public HelperInterpNearest { struct HelperInterpLinear : public HelperInterpBase { +<<<<<<< HEAD static constexpr int interp_size = 2; +======= + static const int interp_size = 2; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Compute indices and weights for each interpolated dimension // indices_weights = { @@ -1275,7 +1319,11 @@ struct HelperInterpLinear : public HelperInterpBase { struct HelperInterpCubic : public HelperInterpBase { +<<<<<<< HEAD static constexpr int interp_size = 4; +======= + static const int interp_size = 4; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Compute indices and weights for each interpolated dimension // indices_weights = { diff --git a/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h b/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h index 073cc4fd7e8bb..debaceef74b5f 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h +++ b/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h @@ -35,7 +35,11 @@ Like PIL, Pillow is licensed under the open source HPND License namespace { +<<<<<<< HEAD inline __m128i mm_cvtsi32_si128(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { +======= +static inline __m128i mm_cvtsi32_si128(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int32_t v; if (i32_aligned) { v = *(const int32_t*)ptr; @@ -45,11 +49,19 @@ inline __m128i mm_cvtsi32_si128(const uint8_t* C10_RESTRICT ptr, bool i32_aligne return _mm_cvtsi32_si128(v); } +<<<<<<< HEAD inline __m128i mm_cvtepu8_epi32(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { return _mm_cvtepu8_epi32(mm_cvtsi32_si128(ptr, i32_aligned)); } inline void _write_endline_rgb_as_uint32( +======= +static inline __m128i mm_cvtepu8_epi32(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { + return _mm_cvtepu8_epi32(mm_cvtsi32_si128(ptr, i32_aligned)); +} + +static inline void _write_endline_rgb_as_uint32( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uint8_t* C10_RESTRICT output, uint32_t data ) { @@ -889,7 +901,11 @@ void ImagingResampleHorizontalConvolution8u( _mm_loadu_si128((__m128i *) (lineIn_min + stride * i))), _mm_loadu_si128((__m128i *) (lineIn_min + stride * (i + 4))), 1); +<<<<<<< HEAD // Extract lower part of each lane, cast to epi16 and reorder RGBARGBA -> RRGGBBAA +======= + // Extract lower part of each lane, cast to epi16 and reoder RGBARGBA -> RRGGBBAA +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // RGBA: pix1 = [ // r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0 // r4 0 r5 0 g4 0 g5 0 b4 0 b5 0 a4 0 a5 0 diff --git a/aten/src/ATen/native/cpu/avx_mathfun.h b/aten/src/ATen/native/cpu/avx_mathfun.h index d66bb0a0ec068..a5fa02558cd01 100644 --- a/aten/src/ATen/native/cpu/avx_mathfun.h +++ 
b/aten/src/ATen/native/cpu/avx_mathfun.h @@ -240,7 +240,11 @@ _PS256_CONST(coscof_p2, 4.166664568298827E-002); _PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI +<<<<<<< HEAD /* evaluation of 8 sines at once using AVX intrinsics +======= +/* evaluation of 8 sines at onces using AVX intrinsics +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) The code is the exact rewriting of the cephes sinf function. Precision is excellent as long as x < 8192 (I did not bother to diff --git a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp index d013dfa0485e0..a471ff71cce07 100644 --- a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp @@ -318,7 +318,11 @@ batch_norm_cpu_collect_stats_channels_last_impl( // // The optimal THRESHOLD to tile was found empirically. // When C > THRESHOLD, C is large enough that the benefit from tiling and vectorization outweigh the synchronization overhead. +<<<<<<< HEAD // When C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization. +======= + // Wehn C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // When num_threads == 1, always use Method 2 as there is no synchronization overhead. // diff --git a/aten/src/ATen/native/cpu/group_norm_kernel.cpp b/aten/src/ATen/native/cpu/group_norm_kernel.cpp index adac022bc8a5d..f157a6a314105 100644 --- a/aten/src/ATen/native/cpu/group_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/group_norm_kernel.cpp @@ -311,7 +311,11 @@ void GroupNormKernelImplChannelsLastInternal( const bool gamma_null = (gamma_data == nullptr); const bool beta_null = beta_data == nullptr; +<<<<<<< HEAD // NB: About algorithm chosen: +======= + // NB: About algorithm choosen: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // On channels last, GroupNorm has a input shape of {N, H, W, GD}, // Mean and rstd are collected per each n and g, which involves reduction diff --git a/aten/src/ATen/native/cpu/int4mm_kernel.cpp b/aten/src/ATen/native/cpu/int4mm_kernel.cpp index a9683ba4bef3f..f45c0e450ee9d 100644 --- a/aten/src/ATen/native/cpu/int4mm_kernel.cpp +++ b/aten/src/ATen/native/cpu/int4mm_kernel.cpp @@ -838,7 +838,11 @@ void dyn_quant_pack_4bit_weight_kernel( } } +<<<<<<< HEAD void ref_dyn_quant_matmul_4bit_channelwise_kernel( +======= +static void ref_dyn_quant_matmul_4bit_channelwise_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) size_t m, size_t n, size_t k, @@ -906,7 +910,11 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel( // Round to nearest integer const int32_t nudged_zero_point0 = lrintf(zero_point0); +<<<<<<< HEAD int8_t* dst_ptr = lhs_qa8dx + m_idx * dst_stride; +======= + int8_t* dst_ptr = (int8_t*)lhs_qa8dx + m_idx * dst_stride; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // LHS offset at the beginning of the row *((float*)(dst_ptr)) = 
recip_scale0; @@ -930,7 +938,11 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel( } }; +<<<<<<< HEAD // Dynamically Quantize the float32 input to 8 bit asymmetric +======= + // Dynamically Quantize the float32 input to 8 bit assymetric +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) input_quant_pack_8bit_channelwise(m, k, lhs_f32, (int8_t*)lhs_qa8dx); const size_t lhs_stride = @@ -997,7 +1009,11 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel( } } +<<<<<<< HEAD void ref_dyn_quant_matmul_4bit_groupwise_kernel( +======= +static void ref_dyn_quant_matmul_4bit_groupwise_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) size_t m, size_t n, size_t k, @@ -1048,7 +1064,11 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel( zero_point0 = (std::min)(zero_point0, qmax); const int32_t nudged_zero_point0 = lrintf(zero_point0); +<<<<<<< HEAD int8_t* dst_ptr = lhs_qa8dx + row_idx * dst_stride; +======= + int8_t* dst_ptr = (int8_t*)lhs_qa8dx + row_idx * dst_stride; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) *((float*)(dst_ptr)) = recip_scale0; dst_ptr += sizeof(float); @@ -1163,7 +1183,11 @@ void dyn_quant_matmul_4bit_kernel( const int64_t weight_packed_size = kleidiai::kai_pack_rhs_int4_size(N, K, block_size); if (weight_packed_size == packed_weights.numel()) { +<<<<<<< HEAD // KleidiAI interface internally handles the Channelwise and groupwise +======= + // KleidiAI interface intenally handles the Channelwise and groupwise +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // distinction kleidiai::kai_quant_pack_lhs_int4_mm( output, inp, packed_weights, M, N, K, block_size); diff --git a/aten/src/ATen/native/cpu/int8mm_kernel.cpp b/aten/src/ATen/native/cpu/int8mm_kernel.cpp index 496b982619649..4d53e0c609b6e 100644 --- a/aten/src/ATen/native/cpu/int8mm_kernel.cpp +++ b/aten/src/ATen/native/cpu/int8mm_kernel.cpp @@ -100,7 +100,11 @@ inline void tinygemm_kernel( #elif defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +<<<<<<< HEAD inline float _mm256_reduce_add_ps(__m256& v) { +======= +static inline float _mm256_reduce_add_ps(__m256& v) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 v1 = _mm256_permute2f128_ps(v, v, 0x1); v = _mm256_add_ps(v, v1); v1 = _mm256_shuffle_ps(v, v, 0x4E); @@ -367,6 +371,7 @@ void int8pack_mm_kernel_( auto* C_data = C.data_ptr(); const auto* S_data = scales.const_data_ptr(); +<<<<<<< HEAD int64_t M = A.size(0); int64_t N = B.size(0); int64_t K = A.size(1); @@ -379,15 +384,36 @@ void int8pack_mm_kernel_( at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) { int64_t mb{0}, nb{0}; +======= + int M = A.size(0); + int N = B.size(0); + int K = A.size(1); + int lda = A.stride(0); + constexpr int BLOCK_M = 4; + constexpr int BLOCK_N = 4; + + const int MB = (M + BLOCK_M - 1) / BLOCK_M; + const int NB = (N + BLOCK_N - 1) / BLOCK_N; + + at::parallel_for(0, MB * NB, 0, [&](int begin, int end) { + int mb{0}, nb{0}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) data_index_init(begin, mb, MB, nb, NB); for (const auto i : c10::irange(begin, end)) { (void)i; +<<<<<<< HEAD int64_t mb_start 
= mb * BLOCK_M; int64_t mb_size = std::min(BLOCK_M, M - mb_start); int64_t nb_start = nb * BLOCK_N; int64_t nb_size = std::min(BLOCK_N, N - nb_start); +======= + int mb_start = mb * BLOCK_M; + int mb_size = std::min(BLOCK_M, M - mb_start); + int nb_start = nb * BLOCK_N; + int nb_size = std::min(BLOCK_N, N - nb_start); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto* A_ptr = A_data + mb_start * lda; const auto* B_ptr = B_data + nb_start * K; diff --git a/aten/src/ATen/native/cpu/moments_utils.h b/aten/src/ATen/native/cpu/moments_utils.h index 8aba425e89637..d52bc276fbbff 100644 --- a/aten/src/ATen/native/cpu/moments_utils.h +++ b/aten/src/ATen/native/cpu/moments_utils.h @@ -8,6 +8,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include namespace at::native { @@ -117,11 +121,17 @@ std::pair, opmath_t> RowwiseMomentsImpl(const T* X, int64_t N, in using Vec = vec::Vectorized; const Vec kZeroVec(math_t(0)); +<<<<<<< HEAD std::array m0_stk = {{0}}; std::array m1_stk; m1_stk.fill(kZeroVec); std::array m2_stk; m2_stk.fill(kZeroVec); +======= + c10::SmallVector m0_stk(depth, 0); + c10::SmallVector m1_stk(depth, kZeroVec); + c10::SmallVector m2_stk(depth, kZeroVec); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(m)) { const T* X_ptr = X + i * kChunkSize * kVecSize; diff --git a/aten/src/ATen/native/cpu/utils.h b/aten/src/ATen/native/cpu/utils.h index 827c69629eb37..968835d1874d2 100644 --- a/aten/src/ATen/native/cpu/utils.h +++ b/aten/src/ATen/native/cpu/utils.h @@ -6,9 +6,13 @@ #include #ifdef USE_FBGEMM +<<<<<<< HEAD C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi") #include C10_DIAGNOSTIC_POP() +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif namespace at::native { @@ -167,12 +171,15 @@ inline void transpose(int64_t M, int64_t N, const uint16_t* src, int64 TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM."); fbgemm::transpose_simd(M, N, src, ld_src, dst, ld_dst); } +<<<<<<< HEAD template <> inline void transpose(int64_t M, int64_t N, const uint8_t* src, int64_t ld_src, uint8_t* dst, int64_t ld_dst) { TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM."); fbgemm::transpose_simd(M, N, src, ld_src, dst, ld_dst); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif template diff --git a/aten/src/ATen/native/cuda/ActivationHardsigmoidKernel.cu b/aten/src/ATen/native/cuda/ActivationHardsigmoidKernel.cu index fcacef37ceaf0..8ca84aab0d54f 100644 --- a/aten/src/ATen/native/cuda/ActivationHardsigmoidKernel.cu +++ b/aten/src/ATen/native/cuda/ActivationHardsigmoidKernel.cu @@ -36,7 +36,11 @@ void hardsigmoid_kernel(TensorIteratorBase& iter) { [zero, one_sixth, three, six] GPU_LAMBDA( scalar_t self_val) -> scalar_t { opmath_t x = static_cast(self_val); +<<<<<<< HEAD return std::min(std::max(x + three, zero), six) * one_sixth; +======= + return std::min(std::max(x + three, zero), six) * one_sixth; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) }); }); } diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu index 47c705a667b52..2ac0cbed4d2d6 100644 --- a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu +++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu @@ -526,7 +526,11 @@ namespace { // we are dealing with packed tensor here. max index is the same as numel. +<<<<<<< HEAD // TODO: to really support input tensor large enough to go beyond int32, +======= + // TODO: to really support input tensor large enought to go beyond int32, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // we will need to restrict out shared memory usage and adjust the launch // config; AT_ASSERT(input_.numel() < std::numeric_limits::max()); @@ -681,7 +685,11 @@ namespace { const dim3 grid(grid_x, grid_y, grid_z); // we are dealing with packed tensor here. max index is the same as numel. +<<<<<<< HEAD // TODO: to really support input tensor large enough to go beyond int32, +======= + // TODO: to really support input tensor large enought to go beyond int32, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // we will need to restrict out shared memory usage and adjust the launch // config; AT_ASSERT(input.numel() < std::numeric_limits::max()); diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu index d9a0b0059917f..265b74036e321 100644 --- a/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu +++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu @@ -53,7 +53,11 @@ __global__ void adaptiveaveragepool( const scalar_t *input, scalar_t *output, int isizeT, int isizeH, int isizeW, int osizeT, int osizeH, int osizeW, +<<<<<<< HEAD int64_t sizeD, int64_t istrideB, int64_t istrideD, +======= + int64_t istrideD, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t istrideT, int64_t istrideH, int64_t istrideW, int64_t offsetZ) { // iterates on output pixels @@ -70,17 +74,26 @@ __global__ void adaptiveaveragepool( // select output plane int64_t o_plane = blockIdx.x + offsetZ; ot = o_plane % osizeT; // output frame/time +<<<<<<< HEAD int d = o_plane / osizeT; // flattened (batch, channel) index // Decompose d into batch and channel indices int batch_idx = d / sizeD; int channel_idx = d % sizeD; +======= + int d = o_plane / osizeT; // slice/feature +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // input frame/time range is fixed. 
int istartT = start_index(ot, osizeT, isizeT); int iendT = end_index(ot, osizeT, isizeT); int kT = iendT - istartT; +<<<<<<< HEAD +======= + // input offset by slice/feature and earliest relevant frame/time + const scalar_t *input_dt = input + d*istrideD + istartT*istrideT; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // output offset by slice/feature and frame/time scalar_t *output_dt = output + o_plane*osizeH*osizeW; @@ -95,6 +108,11 @@ __global__ void adaptiveaveragepool( int iendW = end_index(ow, osizeW, isizeW); int kW = iendW - istartW; +<<<<<<< HEAD +======= + // Compute the average pooling from corresponding input pixels + const scalar_t *ptr_input = input_dt + istartH*istrideH + istartW*istrideW; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t *ptr_output = output_dt + oh*osizeW + ow; accscalar_t sum = static_cast(0); @@ -102,6 +120,7 @@ __global__ void adaptiveaveragepool( for (it = 0; it < kT; ++it) { for (ih = 0; ih < kH; ++ih) { for (iw = 0; iw < kW; ++iw) { +<<<<<<< HEAD int64_t input_offset = batch_idx * istrideB + channel_idx * istrideD + (istartT + it) * istrideT + (istartH + ih) * istrideH + (istartW + iw) * istrideW; @@ -109,6 +128,13 @@ __global__ void adaptiveaveragepool( sum += static_cast(val); } } +======= + scalar_t val = ptr_input[ih*istrideH + iw*istrideW]; + sum += static_cast(val); + } + } + ptr_input += istrideT; // next input frame +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Update output const accscalar_t divide_factor = static_cast(kT * kH * kW); @@ -123,7 +149,11 @@ void adaptiveaveragepool_loop( int64_t totalZ, int isizeT, int isizeH, int isizeW, int osizeT, int osizeH, int osizeW, +<<<<<<< HEAD int64_t sizeD, int64_t istrideB, int64_t istrideD, int64_t istrideT, int64_t istrideH, int64_t istrideW) { +======= + int64_t istrideD, int64_t istrideT, int64_t istrideH, int64_t istrideW) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t offsetZ = 0; dim3 threads(32, 8); // each H*W plane is processed by blocksH thread blocks @@ -135,7 +165,11 @@ void adaptiveaveragepool_loop( input_data, output_data, isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, +<<<<<<< HEAD sizeD, istrideB, istrideD, +======= + istrideD, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) istrideT, istrideH, istrideW, offsetZ); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -366,7 +400,11 @@ void adaptive_avg_pool3d_out_cuda_template( int64_t osizeW = output_size[2]; int64_t sizeD, isizeT, isizeH, isizeW; +<<<<<<< HEAD int64_t istrideB, istrideD, istrideT, istrideH, istrideW; +======= + int64_t istrideD, istrideT, istrideH, istrideW; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t totalZ; const Tensor& input = input_.ndimension() == 4 ? 
input_ : input_.contiguous(); @@ -377,7 +415,10 @@ void adaptive_avg_pool3d_out_cuda_template( isizeH = input.size(2); isizeW = input.size(3); +<<<<<<< HEAD istrideB = 0; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) istrideD = input.stride(0); istrideT = input.stride(1); istrideH = input.stride(2); @@ -393,7 +434,10 @@ void adaptive_avg_pool3d_out_cuda_template( isizeH = input.size(3); isizeW = input.size(4); +<<<<<<< HEAD istrideB = input.stride(0); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) istrideD = input.stride(1); istrideT = input.stride(2); istrideH = input.stride(3); @@ -419,7 +463,11 @@ void adaptive_avg_pool3d_out_cuda_template( totalZ, isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, +<<<<<<< HEAD sizeD, istrideB, istrideD, istrideT, istrideH, istrideW); +======= + istrideD, istrideT, istrideH, istrideW); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); } diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index f29be23acd559..46a3684a21974 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -4,7 +4,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include @@ -13,11 +16,15 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include #include +<<<<<<< HEAD #include #include #include @@ -28,6 +35,11 @@ #ifdef USE_FBGEMM_GENAI #include #endif +======= +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include @@ -58,8 +70,165 @@ namespace at::native { +<<<<<<< HEAD using at::blas::ScalingType; using at::blas::SwizzleType; +======= +namespace { + +// TODO: https://github.com/pytorch/pytorch/pull/59380#pullrequestreview-725310492 +c10::MaybeOwned inline resolve_conj_if_indicated(const Tensor& tensor, bool resolve_conj) { + if (resolve_conj && tensor.is_conj()) { + return c10::MaybeOwned::owned(tensor.resolve_conj()); + } else { + return c10::MaybeOwned::borrowed(tensor); + } +} + +c10::MaybeOwned inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, bool transpose_result) { + if (tensor.is_non_overlapping_and_dense()) { // common case + transpose_tensor = tensor.is_contiguous(); + return resolve_conj_if_indicated(tensor, transpose_result ? 
transpose_tensor : !transpose_tensor); + } + IntArrayRef tensor_strides = tensor.strides(); + IntArrayRef tensor_sizes = tensor.sizes(); + if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max(1, tensor_sizes[0]))) { + transpose_tensor = false; + return resolve_conj_if_indicated(tensor, !transpose_result); + } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max(1, tensor_sizes[1]))) { + transpose_tensor = true; + return resolve_conj_if_indicated(tensor, transpose_result); + } else { + transpose_tensor = true; + return c10::MaybeOwned::owned(tensor.clone(at::MemoryFormat::Contiguous)); + } +} + +c10::MaybeOwned inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor) { + if (tensor.is_non_overlapping_and_dense()) { // common case + transpose_tensor = tensor.is_contiguous(); + return resolve_conj_if_indicated(tensor, true); + } + + IntArrayRef tensor_strides = tensor.strides(); + IntArrayRef tensor_sizes = tensor.sizes(); + if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max(1, tensor_sizes[0]))) { + transpose_tensor = false; + return resolve_conj_if_indicated(tensor, true); + } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max(1, tensor_sizes[1]))) { + transpose_tensor = true; + return resolve_conj_if_indicated(tensor, true); + } else { + transpose_tensor = true; + return c10::MaybeOwned::owned(tensor.clone(at::MemoryFormat::Contiguous)); + } +} + + +/** + * @brief Prepares matrices for CUBLAS operation + * + * This constructor prepares tensors for CUBLAS + * The main difference is that PyTorch uses row-major as the default and + * CUBLAS expects column-major. + * + * @details + * To enable row-major output while using CUBLAS, + * we use the mathematical identity that (A × B)^T = B^T × A^T. + * + * Transpose in this context refers to Cublas's(Fortran) definition of transpose (row-major) + * T = row-major, N = col-major + * + * Example: + * For matrices A (M×K)(row-major) and B (K×N)(row-major): + * - Standard multiplication: A × B = (M×K) × (K×N) = M×N result (row-major) + * - Using our transpose trick: (B^T × A^T) = (N×K)(T) × (K×M)(T) = N×M(N) + * - However, since the output form cublas is column-major this is + * - equivalent to an output of size MxN row-major as expected + * + * The transpose flags are derived from the layouts of the passed in tensors + * + * If the operands are in packed float4 format, `k`, `lda` and `ldb` are adjusted + * to their unpacked values to match what cuBLAS expects. + * + * @param mat1 First input matrix + * @param mat2 Second input matrix + * @param c Output matrix (result) + * @param scale_a Optional scaling factor for first matrix + * @param scale_b Optional scaling factor for second matrix + * @param scale_result Optional scaling factor for result + */ +struct cublasCommonArgs { + cublasCommonArgs( + const Tensor& mat1, + const Tensor& mat2, + Tensor& c, + const std::optional& scale_a = std::nullopt, + const std::optional& scale_b = std::nullopt, + const std::optional& scale_result = std::nullopt) { + bool transpose_result = false, transpose_a = false, transpose_b = false; + result = prepare_matrix_for_cublas(c, transpose_result); + mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result); + matb = prepare_matrix_for_cublas(transpose_result ? 
mat1 : mat2, transpose_b, transpose_result); + + // Handle scale tensors if provided + if (scale_a && scale_b) { + // By default since we return in row-major we run the gemm + // as B.T @ A.T, check transpose_result to determine if we flip the scales + scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr(); + scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type(); + scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr(); + scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type(); + } + + if (scale_result) { + scale_result_ptr = scale_result->data_ptr(); + scale_result_dtype = scale_result->scalar_type(); + } + + // Update transpose flags + if (transpose_result) { + transpose_a = !transpose_a; + transpose_b = !transpose_b; + } + + auto sizes_a = mata->sizes(); + auto sizes_b = matb->sizes(); + + m = sizes_a[transpose_result ? 1 : 0]; + k = sizes_a[transpose_result ? 0 : 1]; + n = sizes_b[transpose_result ? 0 : 1]; + lda = mata->stride((transpose_a == transpose_result) ? 1 : 0); + ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0); + result_ld = result->stride(transpose_result ? 0 : 1); + transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n'; + transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n'; + + // cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing + // if the gemm operands are in packed float4 + if (mat1.dtype() == at::kFloat4_e2m1fn_x2 && mat2.dtype() == at::kFloat4_e2m1fn_x2) { + k = k * 2; + lda = lda * 2; + ldb = ldb * 2; + } + } + + // Matrix members + char transa, transb; + int64_t m, n, k; + int64_t lda, ldb, result_ld; + c10::MaybeOwned mata, matb, result; + + // Scale members + void* scale_mata_ptr = nullptr; + void* scale_matb_ptr = nullptr; + void* scale_result_ptr = nullptr; + std::optional scale_mata_dtype; + std::optional scale_matb_dtype; + std::optional scale_result_dtype; +}; +} // namespace +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::MaybeOwned prepare_batch_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, int64_t& ld_tensor, bool transpose_result, int64_t m, int64_t n) { IntArrayRef tensor_strides = tensor.strides(); @@ -114,6 +283,7 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa } } +<<<<<<< HEAD /* * Checks whether DISABLE_ADDMM_CUDA_LT is set. * Additionally, for ROCM we test whether the architecture supports the Lt. 
@@ -221,6 +391,36 @@ static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const template void launchTunableGemmAndBias(cublasCommonArgs &args, const Scalar& alpha, const scalar_t* bias, cuda::blas::GEMMAndBiasActivationEpilogue activation) { +======= +static bool getDisableAddmmCudaLt() { + static const auto env_value = c10::utils::get_env("DISABLE_ADDMM_CUDA_LT"); + if (env_value == "1") { + return true; + } + return false; +} + +#ifdef USE_ROCM +static bool isSupportedHipLtROCmArch(int index) { + static const std::vector archs = { + "gfx90a", "gfx942", +#if ROCM_VERSION >= 60300 + "gfx1100", "gfx1101", "gfx1200", "gfx1201", +#endif +#if ROCM_VERSION >= 60402 + "gfx1150", "gfx1151", +#endif +#if ROCM_VERSION >= 60500 + "gfx950" +#endif + }; + return at::detail::getCUDAHooks().isGPUArch(archs, index); +} +#endif + +template +static void launchTunableGemmAndBias(cublasCommonArgs &args, const Scalar& alpha, const scalar_t* bias, cuda::blas::GEMMAndBiasActivationEpilogue activation) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool transa_ = ((args.transa != 'n') && (args.transa != 'N')); bool transb_ = ((args.transb != 'n') && (args.transb != 'N')); at::cuda::tunable::GemmAndBiasParams params; @@ -259,6 +459,7 @@ void launchTunableGemmAndBias(cublasCommonArgs &args, const Scalar& alpha, const } } +<<<<<<< HEAD template bool launchGemmAndBiasCublasLt( // args contains result which is modified @@ -323,6 +524,9 @@ bool launchGemmCublas( Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, Activation activation=Activation::None, bool disable_addmm_cuda_lt_override=false) { // Shape checks { +======= +Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, Activation activation=Activation::None, bool disable_addmm_cuda_lt_override=false) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Make sure to keep addmm_cuda below in sync with this code; it // preflights a check to try to avoid actually needing to call // expand(). 
@@ -332,6 +536,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() ) +<<<<<<< HEAD if (result.is_same(self)) { TORCH_CHECK(result.dim() == 2, "tensors must be 2-D"); TORCH_CHECK(self.sizes()[0] == mat1.sizes()[0], "self dim 0 must match mat1 dim 0"); @@ -339,10 +544,13 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma } // } Shape checks +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOLINTNEXTLINE(*c-array*) TensorArg targs[]{{result, "out", 0}, {self, "self", 1}, {mat1, "mat1", 2}, {mat2, "mat2", 3}}; checkAllSameGPU(__func__, targs); +<<<<<<< HEAD // Handle whether to use the Lt interface { static bool persistent_disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()); // if lt path fails, we recurse back into this function here and force the lt path to off @@ -388,6 +596,103 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma // Short circuit if the reduction dim is empty if (mat1.sizes()[1] == 0) { +======= + IntArrayRef mat1_sizes = mat1.sizes(); + IntArrayRef mat2_sizes = mat2.sizes(); + IntArrayRef self__sizes; + bool useLtInterface = false; +#if defined(USE_ROCM) + // When hipBLASLt is not supported on the architecture, + // disable_addmm_cuda_lt will always be to set to true + static bool disable_addmm_cuda_lt = + !isSupportedHipLtROCmArch(self.device().index()) || getDisableAddmmCudaLt(); +#else + static bool disable_addmm_cuda_lt = getDisableAddmmCudaLt(); +#endif + // if lt path fails, we recurse back into this function here and force the lt path to off + // we cannot update varible disable_addmm_cuda_lt from above since it is static and would be permanent + bool disable_addmm_cuda_lt_final = disable_addmm_cuda_lt || disable_addmm_cuda_lt_override; +#if defined(USE_ROCM) && ROCM_VERSION == 60400 + // hipblaslt TT fp32 regression on ROCm 6.4, cannot use + cublasCommonArgs _args(mat1, mat2, result); + if (_args.transa == 't' && _args.transb == 't') { + disable_addmm_cuda_lt_final = true; + } +#endif + at::ScalarType scalar_type = mat1.scalar_type(); + bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float; + c10::MaybeOwned self_; + if (&result != &self) { +#if defined(CUDA_VERSION) || defined(USE_ROCM) + // Strangely, if mat2 has only 1 row or column, we get + // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. + // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] + // is to use lt interface only when self is bias. 
+ // for cuda 11.4, cublasLtMatmul is activated + // the last two conditions is to skip 16b transA and non-trans-B having + // leading dim >> rows when they are sliced from a large tensor + // see fbcode/caffe2/test/test_linalg.py:test_corner_cases_of_cublasltmatmul + if (!disable_addmm_cuda_lt_final) { + useLtInterface = beta.toComplexDouble() == 1.0 && self.dim() == 1 && + result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] && + self.is_contiguous() && result.is_contiguous() && +#ifdef USE_ROCM + (scalar_type == at::ScalarType::Float || + scalar_type == at::ScalarType::Half || + scalar_type == at::ScalarType::BFloat16) && +#else + (scalar_type == at::ScalarType::Double || + scalar_type == at::ScalarType::Float || + scalar_type == at::ScalarType::Half || + scalar_type == at::ScalarType::BFloat16) && +#endif +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12010 || defined(USE_ROCM)) + mat2_sizes[0] > 1 && mat2_sizes[1] > 1; +#else + mat2_sizes[0] > 1 && mat2_sizes[1] > 1 && + mat2_sizes[0] < 65535 * 32 && mat2_sizes[1] < 65535 * 32 && + mat1_sizes[0] < 65535 * 32 && mat1_sizes[1] < 65535 * 32 && + // avoid leading dim >> rows bugs + ((mat1.strides()[0] == 1 && mat1.strides()[1] == mat1_sizes[0]) || + (mat1.strides()[1] == 1 && mat1.strides()[0] == mat1_sizes[1]) || + (scalar_type != at::ScalarType::Half && + scalar_type != at::ScalarType::BFloat16)) && + ((mat2.strides()[0] == 1 && mat2.strides()[1] == mat2_sizes[0]) || + (mat2.strides()[1] == 1 && mat2.strides()[0] == mat2_sizes[1]) || + (scalar_type != at::ScalarType::Half && + scalar_type != at::ScalarType::BFloat16)); +#endif + } +#endif + if (!useLtInterface) { + self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); + } + self__sizes = self_->sizes(); + } else { + self_ = c10::MaybeOwned::borrowed(self); + self__sizes = self_->sizes(); + TORCH_CHECK(result.dim() == 2, "tensors must be 2-D"); + TORCH_CHECK(self__sizes[0] == mat1_sizes[0], "self_ dim 0 must match mat1 dim 0"); + TORCH_CHECK(self__sizes[1] == mat2_sizes[1], "self_ dim 1 must match mat2 dim 1"); + } + + if (&result != &self) { + at::native::resize_output(result, {mat1_sizes[0], mat2_sizes[1]}); + if (beta.toComplexDouble() != 0.0 && !useLtInterface) { + at::native::copy_(result, *self_); + } + } + + + IntArrayRef result_sizes = result.sizes(); + if ((result_sizes[0] == 0) || (result_sizes[1] == 0)) { + return result; + } + + cublasCommonArgs args(mat1, mat2, result); + + if (mat1.numel() == 0) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // By definition, when beta==0, values in self should be ignored. nans and infs // should not propagate if (beta.toComplexDouble() == 0.) 
{ @@ -399,6 +704,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma result, self.expand(result.sizes()), at::native::scalar_tensor( +<<<<<<< HEAD beta, self.scalar_type(), std::nullopt /* layout */, @@ -431,12 +737,30 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma #endif } else { // !is_float_output_with_half_input +======= + beta, + self.scalar_type(), + std::nullopt /* layout */, + at::kCPU, + std::nullopt /* pin_memory */)); + } + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); + + if (useLtInterface) { +#if defined(USE_ROCM) + bool okay = true; + if (is_float_output_with_half_input) { + TORCH_CHECK(false, "float output with half input is not enabled for ROCm"); + } else { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "addmm_cuda_lt", [&] { +<<<<<<< HEAD lt_success = launchGemmAndBiasCublasLt(args, self, alpha, activation); } ); @@ -449,14 +773,145 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma // end Lt path } else { // No Lt, we use a GEMM instead +======= + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + launchTunableGemmAndBias( + args, + alpha, + (&result != &self) ? self.const_data_ptr() : nullptr, + activation_to_gemm_and_blas_arg(activation)); + } else { + okay = at::cuda::blas::gemm_and_bias( + args.transa == 't', + args.transb == 't', + args.m, + args.n, + args.k, + alpha.to>(), + args.mata->const_data_ptr(), + args.lda, + args.matb->const_data_ptr(), + args.ldb, + // This condition is needed for mm case on ROCm for hipblasLt path. + // Passing the bias ptr as null to avoid accuracy issues for mm case. + (&result != &self) ? 
self.const_data_ptr() : nullptr, + args.result->data_ptr(), + args.result_ld, + activation_to_gemm_and_blas_arg(activation) + ); + } + }); + } + if (!okay) { + // lt path failed; recurse but disable lt path + return addmm_out_cuda_impl(result, self, mat1, mat2, beta, alpha, activation, true); + } +#else + auto activation_epilogue = activation_to_gemm_and_blas_arg(activation); + bool okay = true; + if (is_float_output_with_half_input) { + AT_DISPATCH_REDUCED_FLOATING_TYPES( + scalar_type, + "addmm_cuda_lt", + [&] { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + TORCH_CHECK(false, "Tunable GEMM is not supported for float output with reduced float input"); + } + else { + okay = at::cuda::blas::gemm_and_bias( + args.transa == 't', + args.transb == 't', + args.m, + args.n, + args.k, + alpha.to>(), + args.mata->const_data_ptr(), + args.lda, + args.matb->const_data_ptr(), + args.ldb, + self.const_data_ptr(), + args.result->data_ptr(), + args.result_ld, + activation_epilogue + ); + }}); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + scalar_type, + "addmm_cuda_lt", + [&] { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + launchTunableGemmAndBias( + args, + alpha, + self.const_data_ptr(), + activation_epilogue); + } + else { + okay = at::cuda::blas::gemm_and_bias( + args.transa == 't', + args.transb == 't', + args.m, + args.n, + args.k, + alpha.to>(), + args.mata->const_data_ptr(), + args.lda, + args.matb->const_data_ptr(), + args.ldb, + self.const_data_ptr(), + args.result->data_ptr(), + args.result_ld, + activation_epilogue + ); + }}); + } + if (!okay) { + // lt path failed; recurse but disable lt path + return addmm_out_cuda_impl(result, self, mat1, mat2, beta, alpha, activation, true); + } +#endif + } else + { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (is_float_output_with_half_input) { AT_DISPATCH_REDUCED_FLOATING_TYPES( scalar_type, "addmm_cuda", [&] { +<<<<<<< HEAD launchGemmCublas(args, alpha, beta); } ); +======= + using opmath_t = at::opmath_type; + opmath_t alpha_val = alpha.to(); + opmath_t beta_val = beta.to(); + const scalar_t* mat1_ptr = args.mata->const_data_ptr(); + const scalar_t* mat2_ptr = args.matb->const_data_ptr(); + + float* result_ptr = args.result->mutable_data_ptr(); + at::cuda::blas::gemm( + args.transa, + args.transb, + args.m, + args.n, + args.k, + alpha_val, + mat1_ptr, + args.lda, + mat2_ptr, + args.ldb, + beta_val, + result_ptr, + args.result_ld); + }); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( at::ScalarType::Half, @@ -464,12 +919,37 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma scalar_type, "addmm_cuda", [&] { +<<<<<<< HEAD launchGemmCublas(args, alpha, beta); } ); } // Apply epilogue +======= + using opmath_t = at::opmath_type; + opmath_t alpha_val = alpha.to(); + opmath_t beta_val = beta.to(); + const scalar_t* mat1_ptr = args.mata->const_data_ptr(); + const scalar_t* mat2_ptr = args.matb->const_data_ptr(); + scalar_t* result_ptr = args.result->mutable_data_ptr(); + at::cuda::blas::gemm( + args.transa, + args.transb, + args.m, + args.n, + args.k, + alpha_val, + mat1_ptr, + args.lda, + mat2_ptr, + args.ldb, + beta_val, + result_ptr, + 
args.result_ld); + }); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (activation) { case Activation::RELU: // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) @@ -481,14 +961,22 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma break; default: break; } +<<<<<<< HEAD } // end GEMM path +======= + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Preprocessor gate here needs to match the inverse of the check // gating activation_to_gemm_and_blas_arg above; here we are manually // performing a post-GELU because we weren't able to use the GELU // epilogue above. #if !defined(CUDA_VERSION) && !defined(USE_ROCM) +<<<<<<< HEAD if (!disable_addmm_cuda_lt && activation == Activation::GELU) { +======= + if (useLtInterface && activation == Activation::GELU) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::gelu_(const_cast(*args.result), "tanh"); } #endif @@ -893,6 +1381,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { return _int_mm_out_cuda(self, mat2, result); } +<<<<<<< HEAD static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, bool is_bmm, const std::optional& self_baddbmm = std::nullopt) { // ref ATen/native/LinearAlgebra.cpp common_checks_baddbmm_bmm TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); @@ -922,6 +1411,708 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat TORCH_CHECK(self.dim() == 3, "self must be a 3D tensor"); TORCH_CHECK(self.sizes() == output_size, "self must have the same shape as the output"); } +======= +static bool _scaled_mm_allowed_device(bool sm90_only=false) { +#ifdef USE_ROCM + static const std::vector archs = { + "gfx942", +#if ROCM_VERSION >= 60300 + "gfx1200", "gfx1201", +#endif +#if ROCM_VERSION >= 60500 + "gfx950" +#endif + }; + return at::detail::getCUDAHooks().isGPUArch(archs); +#else + auto dprops = at::cuda::getCurrentDeviceProperties(); + if (sm90_only) { + return dprops->major == 9; + } else { + return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9); + } +#endif +} + +#ifdef USE_ROCM +static bool _scaled_mm_is_fnuz() { + return at::detail::getCUDAHooks().isGPUArch({"gfx942"}); +} +#endif + +namespace{ + +enum class ScalingType : std::uint8_t { + TensorWise, + RowWise, + BlockWise, + Error +}; +/* + * Scaling Type Determination: + * --------------------------- + * Conditions and corresponding Scaling Types: + * + * - If scale tensors are both `Float8_e8m0fnu` or `Float8_e4m3fn`: + * - Returns BlockWise (with additional size checks). + * + * - If scale_a.numel() == 1 && scale_b.numel() == 1: + * - Returns TensorWise. + * + * - Else if scale_a.dim() == 2 && scale_a.size(0) == dim_m && scale_b.size(0) == dim_n: + * - Returns RowWise. + * + * - Otherwise: + * - Returns Error. 
+ */ + +// Validates the scale tensors to scaled_mm +// And returns the type of scaling/which kernel to use +ScalingType get_scaling_type( + const at::Tensor& scale_a, + const at::Tensor& scale_b, + int64_t dim_m, + int64_t dim_k, + int64_t dim_n) { + // Check for BlockWise scaling (FP8_E8M0 and FP8_E4M3 types) + if ((scale_a.scalar_type() == scale_b.scalar_type()) && + ((scale_a.scalar_type() == at::kFloat8_e8m0fnu) || (scale_a.scalar_type() == at::kFloat8_e4m3fn))) { + const bool is_nvfp4 = scale_a.scalar_type() == at::kFloat8_e4m3fn; + + // cuBLAS's mxfp8 gemm: block_size is 1 scale per 32 elements + // cuBLAS's nvfp4 gemm: block_size is 1 scale per 16 unpacked elements. + const auto BLOCK_SIZE_K = is_nvfp4 ? 16 : 32; + + constexpr int64_t BLOCK_SIZE_MN = 128; + + // adjust for fp4x2 packing if necessary + const auto dim_k_unpacked = is_nvfp4 ? dim_k * 2 : dim_k; + + auto ceil_div = [](auto a, auto b) { return (a + b - 1) / b; }; + auto num_k_blocks = ceil_div(dim_k_unpacked, BLOCK_SIZE_K); + auto padded_num_k_blocks = ceil_div(num_k_blocks, 4) * 4; + + // TODO: We might want to enforce some structure on the shapes of the scale + // tensors + + // Check expected sizes for block-wise scaling + auto expected_a_size = + BLOCK_SIZE_MN * ceil_div(dim_m, BLOCK_SIZE_MN) * padded_num_k_blocks; + auto expected_b_size = + BLOCK_SIZE_MN * ceil_div(dim_n, BLOCK_SIZE_MN) * padded_num_k_blocks; + + //TODO: enable the checks for ROCm +#ifndef USE_ROCM + TORCH_CHECK(scale_a.numel() == expected_a_size, + "For BlockWise scaling: Expected scale_a size to be ", + expected_a_size, " but got ", scale_a.numel()); + TORCH_CHECK(scale_b.numel() == expected_b_size, + "For BlockWise scaling: Expected scale_b size to be ", + expected_b_size, " but got ", scale_b.numel()); +#endif + + TORCH_CHECK( + scale_a.is_contiguous() && scale_b.is_contiguous(), + "For BlockWise scaling: Both scale_a and scale_b must be contiguous"); + + return ScalingType::BlockWise; + } + // Both Per-Tensor and Row-wise scaling expect fp32 tensors + TORCH_CHECK( + scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat, + "Both scale_a and scale_b must be float (fp32) tensors."); + + // Check the singluar scale case for per-tensor scaling + if (scale_a.numel() == 1 && scale_b.numel() == 1) { + return ScalingType::TensorWise; + } + + // For non-TensorWise scaling, enforce 2D input tensors + TORCH_CHECK( + scale_a.dim() == 2 && scale_b.dim() == 2, + "For non-TensorWise scaling, scale tensors must be 2-dimensional, " + "but got scale_a.dim()=", + scale_a.dim(), + " and scale_b.dim()=", + scale_b.dim()); + + // Check for RowWise scaling + if (scale_a.size(0) == dim_m && scale_a.size(1) == 1 && + scale_b.size(0) == 1 && scale_b.size(1) == dim_n) { +#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || \ + (defined(USE_ROCM) && (defined(HIPBLASLT_VEC_EXT) || defined(HIPBLASLT_OUTER_VEC))) + TORCH_CHECK( + scale_a.is_contiguous() && scale_b.is_contiguous(), + "Both scale_a and scale_b must be contiguous for RowWise scaling."); + return ScalingType::RowWise; +#else + TORCH_CHECK(false, "Per-row scaling is not supported for this platform!"); + return ScalingType::Error; +#endif + } + + // If we reach here, the input doesn't match any valid scaling type + TORCH_CHECK( + false, + "Invalid scaling configuration. For TensorWise scaling, both scales should be scalar. " + "For RowWise scaling, scale_a should be (", + dim_m, + ", 1) and scale_b should be (1, ", + dim_n, + "). 
" + "Got scale_a.size()=(", + scale_a.size(0), + ", ", + scale_a.size(1), + ") and ", + "scale_b.size()=(", + scale_b.size(0), + ", ", + scale_b.size(1), + ")"); + + return ScalingType::Error; +} + +} // namespace + + +// Computes matrix multiply + bias while applying scaling to input and output matrices +// Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default. +// If output matrix type is 16 or 32-bit type, scale_result is not applied. +// Known limitations: +// - Only works if mat1 is row-major and mat2 is column-major +// - Only works if matrices sizes are divisible by 32 +// - If 1-dimensional tensors are used then scale_a should be size = mat1.size(0) +// and scale_b should have size = to mat2.size(1) +// Arguments: +// - `mat1`: the first operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `mat2`: the second operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16` +// - `out_dtype`: the output dtype, can either be a float8 or a higher precision floating point type +// - `scale_a`: a scalar or 1-dimensional tensor with the inverse scale of `mat1`, only needed if `mat1` is a float8 type +// - `scale_b`: a scalar or 1-dimensional tensor with the inverse scale of `mat2`, only needed if `mat2` is a float8 type +// - `scale_result`: a scalar tensor with the scale of the output, only utilized if the output is a float8 type +// - `use_fast_accum`: if true, enables fast float8 accumulation +// - `out`: a reference to the output tensor + +Tensor& +_scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const std::optional& scale_result, + std::optional out_dtype, + bool use_fast_accum, + Tensor& out) { + // Check sizes + bool allowed_device = _scaled_mm_allowed_device(); + TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); + TORCH_CHECK( + mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", + mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); + + // Check what type of scaling we are doing based on inputs + ScalingType scaling_choice = get_scaling_type(scale_a, scale_b, mat1.size(0), mat1.size(1), mat2.size(1)); + TORCH_INTERNAL_ASSERT(scaling_choice != ScalingType::Error, "Scaling type not supported"); + + TORCH_CHECK(!scale_result || (scale_result->numel() == 1 && scale_result->scalar_type() == kFloat), + "scale_result must be a float scalar"); + TORCH_CHECK(!bias || bias->numel() == mat2.sizes()[1], "Bias must be size ", mat2.sizes()[1], + " but got ", bias->numel()); + TORCH_CHECK( + mat1.sizes()[1] % 16 == 0, + "Expected trailing dimension of mat1 to be divisible by 16 ", + "but got mat1 shape: (", + mat1.sizes()[0], + "x", + mat1.sizes()[1], + ")."); + TORCH_CHECK(mat2.sizes()[0] % 16 == 0 && mat2.sizes()[1] % 16 == 0, "mat2 shape (", mat2.sizes()[0], "x", + mat2.sizes()[1], ") must be divisible by 16"); + // Check types + TORCH_CHECK(!out_dtype || *out_dtype == out.scalar_type(), "out_dtype must match output matrix type"); + TORCH_CHECK(isFloat8Type(mat1.scalar_type()) || mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2, "Expected mat1 to be 
Float8 or Float4_x2 matrix got ", mat1.scalar_type()); + TORCH_CHECK(isFloat8Type(mat2.scalar_type()) || mat2.scalar_type() == ScalarType::Float4_e2m1fn_x2, "Expected mat2 to be Float8 or Float4_x2 matrix got ", mat2.scalar_type()); +#ifndef USE_ROCM + // Type restrictions imposed by CuBLASLt as of CUDA-12.1 + TORCH_CHECK(mat1.scalar_type() != ScalarType::Float8_e5m2 || mat2.scalar_type() != ScalarType::Float8_e5m2, + "Multiplication of two Float8_e5m2 matrices is not supported"); +#endif +#ifdef USE_ROCM + if (mat1.scalar_type() == ScalarType::Float8_e5m2 || mat2.scalar_type() == ScalarType::Float8_e5m2) { + TORCH_CHECK(ROCM_VERSION >= 60000, "Float8_e5m2 is only supported for ROCm 6.0 and above"); + } + if (mat1.scalar_type() == ScalarType::Float8_e4m3fn || mat2.scalar_type() == ScalarType::Float8_e4m3fn) { + TORCH_CHECK(ROCM_VERSION >= 60000, "Float8_e4m3fn is only supported for ROCm 6.0 and above"); + } +#endif + if (use_fast_accum) { + TORCH_CHECK(mat1.scalar_type() != ScalarType::Float4_e2m1fn_x2 && mat2.scalar_type() != ScalarType::Float4_e2m1fn_x2, "`use_fast_accum` is not supported when `mat1` or `mat2` tensors have the `Float4_e2m1fn_x2` dtype."); + } +#ifdef USE_ROCM + if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2 || mat2.scalar_type() == ScalarType::Float4_e2m1fn_x2) { + TORCH_CHECK(ROCM_VERSION >= 70000, "Float4_e2m1fn_x2 is only supported for ROCm 7.0 and above"); + } + if (mat1.scalar_type() == ScalarType::Float8_e5m2 || mat2.scalar_type() == ScalarType::Float8_e5m2) { + TORCH_CHECK(ROCM_VERSION >= 70000, "Float8_e5m2 is only supported for ROCm 7.0 and above"); + } + if (mat1.scalar_type() == ScalarType::Float8_e4m3fn || mat2.scalar_type() == ScalarType::Float8_e4m3fn) { + TORCH_CHECK(ROCM_VERSION >= 70000, "Float8_e4m3fn is only supported for ROCm 7.0 and above"); + } +#endif + if (bias) { + TORCH_CHECK(out.scalar_type() != kFloat, "Bias is not supported when out_dtype is set to Float32"); + TORCH_CHECK(bias->scalar_type() == ScalarType::BFloat16 || bias->scalar_type() == ScalarType::Half, + "Bias must be either Half or BFloat16, but got ", bias->scalar_type()); + TORCH_CHECK((out.scalar_type() != kFloat && out.scalar_type() != ScalarType::BFloat16) || + bias->scalar_type() == ScalarType::BFloat16, + "Bias must be BFloat16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type()); + TORCH_CHECK(out.scalar_type() != ScalarType::Half || bias->scalar_type() == ScalarType::Half, + "Bias must be Float16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type()); + } + { + auto bias_ = bias.value_or(Tensor()); + auto scale_result_ = scale_result.value_or(Tensor()); + + // NOLINTNEXTLINE(*c-array*) + TensorArg targs[]{{out, "out", 0}, {mat1, "mat1", 1}, {mat2, "mat2", 2}, + {bias_, "bias", 3}, {scale_a, "scale_a", 4}, {scale_b, "scale_b", 5}, + {scale_result_, "scale_result", 6}}; + checkAllSameGPU(__func__, targs); + } + // Validation checks have passed lets resize the output to actual size + IntArrayRef mat1_sizes = mat1.sizes(); + IntArrayRef mat2_sizes = mat2.sizes(); + at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]}); + + // If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm kernels + // do not support this case). + if (mat1_sizes[0] == 0 || mat1_sizes[1] == 0 || mat2_sizes[1] == 0) { + // `out` was created with `at::empty`. In the case where we are multiplying + // MxK by KxN and K is the zero dim, we need to initialize here to properly + // return a tensor of zeros. 
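// Editor's note (illustrative sketch, not part of the patch): the block-wise branch of
// get_scaling_type() above sizes each scale tensor as 128 * ceil(M/128) * pad4(ceil(K/32))
// for mxfp8 (one scale per 32 elements along K), with a 16-element unpacked block and
// fp4x2 packing adjustment for nvfp4. The standalone helper below reproduces that
// arithmetic with a worked example; it is not part of the ATen code.
#include <cstdint>

constexpr int64_t ceil_div_i64(int64_t a, int64_t b) { return (a + b - 1) / b; }

constexpr int64_t expected_blockwise_scale_numel(int64_t rows, int64_t k, bool is_nvfp4) {
  const int64_t block_k = is_nvfp4 ? 16 : 32;
  const int64_t k_unpacked = is_nvfp4 ? k * 2 : k;  // fp4x2 packs two values per element
  const int64_t padded_k_blocks = ceil_div_i64(ceil_div_i64(k_unpacked, block_k), 4) * 4;
  return 128 * ceil_div_i64(rows, 128) * padded_k_blocks;
}

// Example: an mxfp8 operand with M = 256, K = 1024 expects 128 * 2 * 32 = 8192 scales.
static_assert(expected_blockwise_scale_numel(256, 1024, /*is_nvfp4=*/false) == 8192,
              "block-wise scale sizing example");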
+ if (mat1_sizes[1] == 0) { + out.zero_(); + } + + return out; + } + + // ROCm's hipblaslt supports rowwise, so skip this check that sends this to cutlass. +#ifndef USE_ROCM + // We are doing row-wise scaling + if (scaling_choice == ScalingType::RowWise) { + TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling."); + at::cuda::detail::f8f8bf16_rowwise( + mat1, + mat2, + scale_a, + scale_b, + bias, + use_fast_accum, + out); + return out; + } +#else + if (scaling_choice == ScalingType::RowWise) { + // For ROCm, match behavior of f8f8bf16_rowwise type checking + Tensor b = mat2; + if (_scaled_mm_is_fnuz()) { + TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fnuz); + } + else { + TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fn); + } + // Until more than bf16 is supported + TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16, + "hipblaslt rowwise _scaled_mm only supports BFloat16 output"); + } + else if (scaling_choice == ScalingType::BlockWise) { +#if ROCM_VERSION >= 70000 + TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}, 0), + "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950"); + + TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 && + mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0, + "Matrix dimensions must be multiples of 32 for block-wise scaling"); + + TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 || + out.scalar_type() == ScalarType::Half, + "Block-wise scaling only supports BFloat16 or Half output types"); +#else + TORCH_CHECK(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later"); +#endif + } +#endif + + cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, scale_result); + const auto out_dtype_ = args.result->scalar_type(); + TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt"); + +#ifdef USE_ROCM + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { +#define TUNABLE_DISPATCH(BLASOP_A, BLASOP_B) \ + if (mat1.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fnuz, at::Float8_e4m3fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fnuz, at::Float8_e5m2fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } \ + else if (mat1.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2fnuz, at::Float8_e4m3fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2fnuz, at::Float8_e5m2fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } \ + else if (mat1.scalar_type() == ScalarType::Float8_e4m3fn) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fn, at::Float8_e4m3fn, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2) { \ + static 
at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fn, at::Float8_e5m2, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } \ + else if (mat1.scalar_type() == ScalarType::Float8_e5m2) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2, at::Float8_e4m3fn, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2, at::Float8_e5m2, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } + AT_DISPATCH_V2(out_dtype_, "_tunable_scaled_gemm", AT_WRAP([&] { + bool transa_ = ((args.transa != 'n') && (args.transa != 'N')); + bool transb_ = ((args.transb != 'n') && (args.transb != 'N')); + at::cuda::tunable::ScaledGemmParams params; + params.transa = args.transa; + params.transb = args.transb; + params.m = args.m; + params.n = args.n; + params.k = args.k; + params.a = args.mata->data_ptr(); + params.a_scale_ptr = args.scale_mata_ptr; + params.a_scale_dtype = scale_a.scalar_type(); + params.lda = args.lda; + params.a_dtype = args.mata->scalar_type(); + params.b = args.matb->data_ptr(); + params.b_scale_ptr = args.scale_matb_ptr; + params.b_scale_dtype = scale_b.scalar_type(); + params.ldb = args.ldb; + params.b_dtype = args.matb->scalar_type(); + params.bias_ptr = bias ? bias->data_ptr(): nullptr; + params.bias_dtype = bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_; + params.c = args.result->data_ptr(); + params.c_scale_ptr = args.scale_result_ptr; + params.ldc = args.result_ld; + params.c_dtype = out_dtype_; + params.use_fast_accum = use_fast_accum; + params.use_rowwise = scaling_choice == ScalingType::RowWise; + if (transa_ && transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::T) + } + else if (transa_ && !transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::N) + } + else if (!transa_ && transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::T) + } + else if (!transa_ && !transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::N) + } + else { + TORCH_CHECK(false, "unreachable"); + } + }), + kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_FLOATING_TYPES)); +#undef TUNABLE_DISPATCH + } + else +#endif + { + at::cuda::blas::scaled_gemm( + args.transa, + args.transb, + args.m, + args.n, + args.k, + args.mata->data_ptr(), + args.scale_mata_ptr, + args.lda, + args.mata->scalar_type(), + args.scale_mata_dtype.value(), + args.matb->data_ptr(), + args.scale_matb_ptr, + args.ldb, + args.matb->scalar_type(), + args.scale_matb_dtype.value(), + bias ? bias->data_ptr(): nullptr, + bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? 
at::ScalarType::Half : out_dtype_, + args.result->data_ptr(), + args.scale_result_ptr, + args.result_ld, + out_dtype_, + use_fast_accum, + scaling_choice == ScalingType::RowWise); + } + + return out; +} + +namespace { + at::Tensor create_grouped_gemm_output_tensor(const Tensor& mat_a, + const Tensor& mat_b, + const std::optional& offs, + std::optional out_dtype + ) { + c10::SmallVector out_size; + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + if (a_is_2d) { + if (b_is_2d) { + out_size = {offs->size(0), mat_a.size(0), mat_b.size(1)}; + } else { + TORCH_CHECK(offs->size(0) == mat_b.size(0), "matrix batch sizes have to match"); + out_size = {mat_a.size(0), mat_b.size(-1)}; + } + } else { + if (b_is_2d) { + // this case is not actually encountered for MoE gemms + TORCH_CHECK(offs->size(0) == mat_a.size(0), "matrix batch sizes have to match"); + out_size = {mat_a.size(1), mat_b.size(1)}; + } else { // regular bmm + TORCH_CHECK(mat_a.size(0) == mat_b.size(0), "batched dimension has to match"); + out_size = {mat_a.size(0), mat_a.size(1), mat_b.size(-1)}; + } + } + + const auto out_dtype_ = out_dtype.value_or(kBFloat16); + TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm"); + + // For TMA transfers, strides of output tensor have to be either + // 1, or aligned to 16 bytes. + const auto last_dim = out_size.size() - 1; + const auto alignment = 16 / c10::elementSize(out_dtype_); + const int64_t size_padded = (out_size[last_dim] + alignment - 1) / alignment * alignment; + std::vector out_stride; + if (a_is_2d != b_is_2d) { + out_stride = {size_padded, 1}; + } else { + out_stride = {out_size[1] * size_padded, size_padded, 1}; + } + auto out = at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype_)); + + return out; + } + + bool check_valid_strides_and_return_transposed(const Tensor& mat) { + IntArrayRef tensor_strides = mat.strides(); + IntArrayRef tensor_sizes = mat.sizes(); + int end_dim = mat.dim() - 1; + int alignment = 16 / mat.element_size(); + TORCH_CHECK(uint64_t(mat.data_ptr()) % 16 ==0, "expected data_ptr to be aligned to 16 bytes\n"); + if ((tensor_strides[end_dim - 1] == 1) && (tensor_strides[end_dim] >= std::max(1, tensor_sizes[end_dim - 1]))) { + TORCH_CHECK(tensor_strides[end_dim] % alignment == 0, "strides should be multiple of 16 bytes"); + return true; + } else if ((tensor_strides[end_dim] == 1) && (tensor_strides[end_dim - 1] >= std::max(1, tensor_sizes[end_dim]))) { + TORCH_CHECK(tensor_strides[end_dim - 1] % alignment == 0, "strides should be multiple of 16 bytes"); + return false; + } else { + TORCH_CHECK(false, "Invalid strides/sizes, got ", mat.strides(), " for strides and ", mat.sizes(), " for sizes"); + } + } + + void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) { + if (mat.dim() == 2) { + TORCH_CHECK( + scale.dim() == 1, + "scale must be a 1D tensor, but got ", + scale.dim(), + "D, arg ", + arg_idx); + TORCH_CHECK( + scale.is_contiguous(), "scale must be contiguous for arg ", arg_idx); + TORCH_CHECK( + scale.size(0) == mat.size(dim) * scale_multiplier, + "scale must have the same length as mat for arg ", + arg_idx); + } else { + TORCH_CHECK( + scale.dim() == 2, + "scale must be a 2D tensor, but got ", + scale.dim(), + "D for arg ", + arg_idx); + TORCH_CHECK( + scale.stride(1) == 1, + "scale must be contiguous in the last dimension for arg ", + arg_idx); + TORCH_CHECK( + scale.size(0) == 
mat.size(0), + "scale must have the same batch dimension as mat for arg ", + arg_idx); + TORCH_CHECK( + scale.size(1) == mat.size(1 + dim), + "scale must have the same first dimension as mat for arg ", + arg_idx); + } +} + + +} + +Tensor +_scaled_mm_cuda(const Tensor& mat_a, const Tensor& mat_b, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const std::optional& scale_result, + std::optional out_dtype, + bool use_fast_accum) { + const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type()); + Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_)); + return _scaled_mm_out_cuda(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out); +} + + +Tensor +_scaled_grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b, +const Tensor& scale_a, const Tensor& scale_b, +const std::optional& offs, +const std::optional& bias, +const std::optional& scale_result, +std::optional out_dtype, +bool use_fast_accum) { +#ifndef USE_ROCM + bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true); + TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0"); + + TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type()); + TORCH_CHECK(mat_b.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_b.scalar_type()); + TORCH_CHECK(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed"); + TORCH_CHECK(check_valid_strides_and_return_transposed(mat_b), "Expected mat2 to be transposed"); + TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d"); + TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d"); + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + TORCH_CHECK( + mat_a.size(-1) % 16 == 0, + "Expected trailing dimension of mat_a to be divisible by 16 ", + "but got mat1 shape: (", + mat_a.sizes(), + ")."); + TORCH_CHECK(mat_b.size(-2) % 16 == 0 && mat_b.size(-1) % 16 == 0, + "Expected mat_b shape to be divisible by 16 ", + "but got mat_b shape: (", + mat_b.sizes(), + ")."); + + + TORCH_CHECK(!bias.has_value(), "Bias not supported yet"); + TORCH_CHECK(!scale_result.has_value(), "Scale result not supported yet"); + TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix"); + + if (offs.has_value()) { + TORCH_CHECK(offs->dim() == 1, "offs has to be 1D"); + TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32"); + } + + // Both Per-Tensor and Row-wise scaling expect fp32 tensors + TORCH_CHECK( + scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat, + "Both scale_a and scale_b must be float (fp32) tensors."); + + const int scale_multiplier = (mat_a.dim() == 2 && mat_b.dim() == 2) ? 
offs->size(0) : 1; + check_scale(mat_a, scale_a, 0 ,0, scale_multiplier); + check_scale(mat_b, scale_b, 1, 1, scale_multiplier); + + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype); + + at::cuda::detail::f8f8bf16_grouped_mm( + mat_a, + mat_b, + scale_a, + scale_b, + offs, + bias, + use_fast_accum, + out); + return out; + + + + +#else + TORCH_CHECK(false, "grouped gemm is not supported on ROCM") +#endif + +} + +Tensor _grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype) { +#ifndef USE_ROCM + bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true); + TORCH_CHECK(allowed_device, "torch._grouped_mm is only supported on CUDA devices with compute capability = 9.0"); + + TORCH_CHECK(mat_a.dtype() == at::kBFloat16, "Expected mat_a to be BFloat16 matrix got ", mat_a.scalar_type()); + TORCH_CHECK(mat_b.dtype() == at::kBFloat16, "Expected mat_a to be BFloat16 matrix got ", mat_b.scalar_type()); + TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d"); + TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d"); + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + + // check that the strides are valid, the fn will throw an error if not + check_valid_strides_and_return_transposed(mat_a); + check_valid_strides_and_return_transposed(mat_b); + TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix, or no offset if both matrices are 3d"); + + if (offs.has_value()) { + TORCH_CHECK(offs->dim() == 1, "offs has to be 1D"); + TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32"); + } + TORCH_CHECK(!bias.has_value(), "Bias not supported yet"); + + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype); + + at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out); + return out; +#else + TORCH_CHECK(false, "grouped gemm is not supported on ROCM") +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) { @@ -933,7 +2124,16 @@ Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::Sca } Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) { +<<<<<<< HEAD baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype, true); +======= + TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); + + TORCH_CHECK(out_dtype == batch1.scalar_type() || + (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)), + "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs"); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Scalar beta(0.0); Scalar alpha(1.0); { @@ -952,7 +2152,16 @@ Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tenso } Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) { +<<<<<<< HEAD baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, false, self); 
+======= + TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); + + TORCH_CHECK(out_dtype == batch1.scalar_type() || + (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)), + "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs"); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { NoNamesGuard guard; baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha); @@ -967,12 +2176,15 @@ Tensor _mm_dtype_cuda(const Tensor& self, const Tensor& mat2, const at::ScalarTy } Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::ScalarType out_dtype, Tensor &out) { +<<<<<<< HEAD TORCH_CHECK(self.dim() == 2, "self must be a matrix, got ", self.dim(), "-D tensor"); TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); TORCH_CHECK( self.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", self.sizes()[0], "x", self.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "input dtypes must be the same"); TORCH_CHECK(out_dtype == self.scalar_type() || @@ -981,7 +2193,11 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); +<<<<<<< HEAD addmm_out_cuda_impl(out, out, self, mat2, 0, 1); +======= + addmm_out_cuda_impl(const_cast(out), out, self, mat2, 0, 1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return out; } @@ -992,6 +2208,7 @@ Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& m } Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) { +<<<<<<< HEAD TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "self and mat2 must have the same dtype, but got ", self.scalar_type(), " and ", mat2.scalar_type()); TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type()); TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); @@ -1000,6 +2217,8 @@ Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tens mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); TORCH_CHECK(out_dtype == self.scalar_type() || (out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)), diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh 
b/aten/src/ATen/native/cuda/CUDALoops.cuh index c42d03b9cbf7f..da11bbfdac099 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -297,7 +297,10 @@ static inline void launch_vectorized_kernel( int vec_size = memory::can_vectorize_up_to(data); c10::DeviceIndex curDevice = -1; AT_CUDA_CHECK(c10::cuda::GetDevice(&curDevice)); +<<<<<<< HEAD // Similar check in vectorized_elementwise_kernel() as well. Both should be in sync. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int tws = at::detail::getCUDAHooks().isGPUArch({"gfx942"}, curDevice) ? 16 : elems_per_thread(); #else using cpp_type = typename function_traits::result_type; @@ -436,6 +439,10 @@ static inline void launch_vectorized_templated_kernel( loader_t l, storer_t s) { TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); +<<<<<<< HEAD +======= + using traits = function_traits; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t grid = (N + vectorized_templated_config::block_work_size() - 1) / vectorized_templated_config::block_work_size(); auto stream = at::cuda::getCurrentCUDAStream(); @@ -856,6 +863,7 @@ struct type_specialized_kernel_launcher { out_calc_t output_offset_calculator, loader_t loader, storer_t storer) { +<<<<<<< HEAD constexpr ScalarType sret_t = rt_binary_specializations[arg_index][0]; constexpr ScalarType sarg0_t = rt_binary_specializations[arg_index][1]; constexpr ScalarType sarg1_t = rt_binary_specializations[arg_index][2]; @@ -863,6 +871,11 @@ struct type_specialized_kernel_launcher { using cret_t = c10::impl::ScalarTypeToCPPTypeT; using carg0_t = c10::impl::ScalarTypeToCPPTypeT; using carg1_t = c10::impl::ScalarTypeToCPPTypeT; +======= + if (ret_t == rt_binary_specializations[arg_index][0] && + arg0_t == rt_binary_specializations[arg_index][1] && + arg1_t == rt_binary_specializations[arg_index][2]) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) launch_vectorized_templated_kernel< func_t, array_t, @@ -870,9 +883,18 @@ struct type_specialized_kernel_launcher { out_calc_t, loader_t, storer_t, +<<<<<<< HEAD cret_t, carg0_t, carg1_t>( +======= + decltype(c10::impl::ScalarTypeToCPPType< + rt_binary_specializations[arg_index][0]>::t), + decltype(c10::impl::ScalarTypeToCPPType< + rt_binary_specializations[arg_index][1]>::t), + decltype(c10::impl::ScalarTypeToCPPType< + rt_binary_specializations[arg_index][2]>::t)>( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) numel, f, data, @@ -880,10 +902,79 @@ struct type_specialized_kernel_launcher { output_offset_calculator, loader, storer); +<<<<<<< HEAD } } }; +======= + } +}; + +template +struct type_specialized_broadcast_kernel_launcher { + template < + typename func_t, + typename array_t, + typename dtypes_t, + typename calc_t> + static void apply( + int64_t numel, + func_t f, + array_t data, + dtypes_t dtypes, + calc_t offset_calc) { + using traits = function_traits; + using ret_t = typename traits::result_type; + using arg0_t = typename traits::template arg<0>::type; + using arg1_t = typename traits::template arg<1>::type; + if (dtypes[0] == rt_binary_specializations[arg_index][0] && + dtypes[1] == rt_binary_specializations[arg_index][1] && + dtypes[2] == 
rt_binary_specializations[arg_index][2]) { + using ret_cpp_t = c10::impl::ScalarTypeToCPPTypeT; + using arg0_cpp_t = c10::impl::ScalarTypeToCPPTypeT; + using arg1_cpp_t = c10::impl::ScalarTypeToCPPTypeT; + constexpr int grp_sz = 128; + launch_legacy_kernel_manual_unroll(numel, [=] GPU_LAMBDA(int idx, bool unrl) { + if (unrl) { + auto offsets0 = offset_calc.get(idx); + auto offsets1 = offset_calc.get(idx + grp_sz); + auto offsets2 = offset_calc.get(idx + grp_sz * 2); + auto offsets3 = offset_calc.get(idx + grp_sz * 3); + void* out0 = data[0] + offsets0[0]; + void* out1 = data[0] + offsets1[0]; + void* out2 = data[0] + offsets2[0]; + void* out3 = data[0] + offsets3[0]; + auto u = c10::load(data[1] + offsets0[1]); + auto v = c10::load(data[2] + offsets0[2]); + ret_t result0 = f(c10::convert(u), c10::convert(v)); + auto u1 = c10::load(data[1] + offsets1[1]); + auto v1 = c10::load(data[2]+ offsets1[2]); + ret_t result1 = f(c10::convert(u1), c10::convert(v1)); + auto u2 = c10::load(data[1] + offsets2[1]); + auto v2 = c10::load(data[2] + offsets2[2]); + ret_t result2 = f(c10::convert(u2), c10::convert(v2)); + auto u3 = c10::load(data[1] + offsets3[1]); + auto v3 = c10::load(data[2] + offsets3[2]); + ret_t result3 = f(c10::convert(u3), c10::convert(v3)); + *(ret_cpp_t*)out0 = c10::convert(result0); + *(ret_cpp_t*)out1 = c10::convert(result1); + *(ret_cpp_t*)out2 = c10::convert(result2); + *(ret_cpp_t*)out3 = c10::convert(result3); + } else { + auto offsets = offset_calc.get(idx); + void* out = data[0] + offsets[0]; + auto u = c10::load(data[1] + offsets[1]); + auto v = c10::load(data[2] + offsets[2]); + ret_t result = f(c10::convert(u), c10::convert(v)); + *(ret_cpp_t*)out = c10::convert(result); + } + }); + } + } +}; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace #endif @@ -1002,6 +1093,35 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { } auto offset_calc = ::make_offset_calculator(iter); #ifdef USE_ROCM +<<<<<<< HEAD +======= + if (check_binary_rt_types_for_specialization(iter)) { + // constexpr to reduce the amount of kernels generated for + // broadcast elementwise with mexed dtypes and limit which functors are actually + // applied to the load and store at compile time. + using func_tuple = typename traits::ArgsTuple; + if constexpr ( + std::is_same_v && traits::arity == 2 && + check_binary_functor_types_for_specialization< + func_tuple, + float, + float, + traits::arity, + /*arg_num=*/0>::check()) { + memory::detail::static_unroll< + type_specialized_broadcast_kernel_launcher, + rt_binary_specializations.size()>::with_args( + numel, + f, + data, + dtypes, + offset_calc + ); + return; + } + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr int grp_sz = 128; launch_legacy_kernel_manual_unroll(numel, [=] GPU_LAMBDA(int idx, bool unrl) { if (unrl) { diff --git a/aten/src/ATen/native/cuda/CUDAScalar.cu b/aten/src/ATen/native/cuda/CUDAScalar.cu index 0d34bd52f211a..524578b07c90b 100644 --- a/aten/src/ATen/native/cuda/CUDAScalar.cu +++ b/aten/src/ATen/native/cuda/CUDAScalar.cu @@ -11,11 +11,31 @@ #include +<<<<<<< HEAD +======= +#if defined(USE_ROCM) +// TODO(lufang): Tensor.item() on AMD HIP is not synced in the Recsys models. +// This is just a short term workaround. Issue is tracked as FBA-388 on the AMD side. 
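// Editor's note (illustrative sketch, not part of the patch): returning to the
// CUDALoops.cuh hunk above, the new type_specialized_broadcast_kernel_launcher is taken
// only when the iterator's runtime (out, in0, in1) dtype triple matches one of the
// compile-time rt_binary_specializations; otherwise the generic offset-calculator loop
// with dynamic casting runs. The host-side analogue below shows just that matching step.
// The concrete triples listed here are assumptions for illustration (float output with
// float/half/bfloat16 inputs), not a copy of the real table.
#include <array>
#include <cstdint>

enum class DType : std::uint8_t { Float, Half, BFloat16 };

// Same layout as rt_binary_specializations: {result, arg0, arg1}.
constexpr std::array<std::array<DType, 3>, 3> kSpecializedTriples{{
    {{DType::Float, DType::BFloat16, DType::Float}},
    {{DType::Float, DType::Float, DType::BFloat16}},
    {{DType::Float, DType::Float, DType::Half}},
}};

// True when the runtime dtypes would hit the specialized broadcast fast path.
inline bool takes_specialized_broadcast_path(DType out, DType in0, DType in1) {
  for (const auto& t : kSpecializedTriples) {
    if (t[0] == out && t[1] == in0 && t[2] == in1) {
      return true;
    }
  }
  return false;
}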
+namespace { + bool use_sync_mode() { + static const bool sync_mode = c10::utils::check_env("HIP_DOUBLE_SYNC_ON_LOCAL_SCALE_DENSE") == true; + return sync_mode; + } +} +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::native { Scalar _local_scalar_dense_cuda(const Tensor& self) { Scalar r; TORCH_CHECK(self.numel() > 0, "_local_scalar_dense: Empty tensor not supported"); +<<<<<<< HEAD +======= +#if defined(USE_ROCM) + if (!use_sync_mode()){ +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_V2( self.scalar_type(), "_local_scalar_dense_cuda", AT_WRAP([&] { // Create pinned memory for the scalar value to avoid implicit @@ -32,6 +52,18 @@ Scalar _local_scalar_dense_cuda(const Tensor& self) { at::cuda::memcpy_and_sync((void *)value.const_data_ptr(), self.const_data_ptr(), sizeof(scalar_t), cudaMemcpyDeviceToHost, stream); r = Scalar(*value.const_data_ptr()); }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kHalf, kBool, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); +<<<<<<< HEAD +======= +#if defined(USE_ROCM) + } else { + auto cpu_self = self.cpu(); + AT_DISPATCH_V2( + self.scalar_type(), "_local_scalar_dense_hip", AT_WRAP([&] { + r = Scalar(*cpu_self.const_data_ptr()); + }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kHalf, kBool, kBFloat16, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); + } +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return r; } diff --git a/aten/src/ATen/native/cuda/Copy.cu b/aten/src/ATen/native/cuda/Copy.cu index 754582d2d9777..aec69425e4b3f 100644 --- a/aten/src/ATen/native/cuda/Copy.cu +++ b/aten/src/ATen/native/cuda/Copy.cu @@ -1,4 +1,5 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS +<<<<<<< HEAD #include #include #include @@ -6,6 +7,15 @@ #include #include #include +======= +#include +#include +#include +#include +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -27,6 +37,7 @@ namespace at::native { +<<<<<<< HEAD namespace { // Initial pool size for CUDA events per device. @@ -45,6 +56,8 @@ at::cuda::CUDAEventPtr getEventFromPool(const at::DeviceIndex device_idx) { } // namespace +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void neg_kernel_cuda(TensorIteratorBase &iter); void conj_kernel_cuda(TensorIteratorBase &iter); @@ -281,6 +294,7 @@ void copy_device_to_device(TensorIterator& iter, // write-after-read dependencies on the destination side are handled, so // that no one is operating on the dst memory when we perform the copy. 
// src waits on dst barrier (src already waits on src) +<<<<<<< HEAD // Use event pool for better performance instead of creating new events auto dst_ready = getEventFromPool(dst_device.index()); @@ -289,6 +303,14 @@ void copy_device_to_device(TensorIterator& iter, device_guard.set_device(src_device); dst_ready->block(copy_stream); +======= + CUDAEvent dst_ready; + device_guard.set_device(dst_device); + dst_ready.record(getCurrentCUDAStream(dst_device.index())); + + device_guard.set_device(src_device); + dst_ready.block(copy_stream); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (memcpy_eligible) { @@ -327,11 +349,19 @@ void copy_device_to_device(TensorIterator& iter, // operate on dst's copy until the copy is complete. // Still on src_device, record stream event +<<<<<<< HEAD auto src_ready = getEventFromPool(src_device.index()); src_ready->record(copy_stream); device_guard.set_device(dst_device); src_ready->block(getCurrentCUDAStream(dst_device.index())); +======= + CUDAEvent src_ready; + src_ready.record(copy_stream); + + device_guard.set_device(dst_device); + src_ready.block(getCurrentCUDAStream(dst_device.index())); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } AT_CUDA_CHECK(cudaGetLastError()); diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h index 333c21e94f18e..49e14b9b51540 100644 --- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -223,7 +223,11 @@ inline CuFFTDataLayout as_cufft_embed(IntArrayRef strides, IntArrayRef sizes, bo class CuFFTConfig { public: +<<<<<<< HEAD // Only move semantics is enough for this class. Although we already use +======= + // Only move semantics is enought for this class. Although we already use +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // unique_ptr for the plan, still remove copy constructor and assignment op so // we don't accidentally copy and take perf hit. 
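// Editor's note (illustrative sketch, not part of the patch): in the Copy.cu hunk above,
// both sides of the conflict implement the same handshake for cross-device copies -- the
// copy stream on the source device waits until the destination stream has retired prior
// work on dst, and afterwards the destination stream waits for the copy -- differing only
// in whether the events come from a per-device pool (HEAD) or are created per copy.
// The raw CUDA runtime sketch below shows that handshake; it is not the ATen code.
#include <cstddef>
#include <cuda_runtime.h>

void cross_device_copy_with_events(void* dst, int dst_device, cudaStream_t dst_stream,
                                   const void* src, int src_device, cudaStream_t copy_stream,
                                   size_t nbytes) {
  cudaEvent_t dst_ready = nullptr;
  cudaEvent_t copy_done = nullptr;
  cudaEventCreateWithFlags(&dst_ready, cudaEventDisableTiming);
  cudaEventCreateWithFlags(&copy_done, cudaEventDisableTiming);

  // dst side: mark the point after which dst memory is safe to overwrite.
  cudaSetDevice(dst_device);
  cudaEventRecord(dst_ready, dst_stream);

  // src side: wait for that point, then issue the copy and mark its completion.
  cudaSetDevice(src_device);
  cudaStreamWaitEvent(copy_stream, dst_ready, 0);
  cudaMemcpyPeerAsync(dst, dst_device, src, src_device, nbytes, copy_stream);
  cudaEventRecord(copy_done, copy_stream);

  // dst side: do not consume the copied data before the copy has finished.
  cudaSetDevice(dst_device);
  cudaStreamWaitEvent(dst_stream, copy_done, 0);

  // Safe even while pending: resources are released once the events complete.
  cudaEventDestroy(dst_ready);
  cudaEventDestroy(copy_done);
}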
CuFFTConfig(const CuFFTConfig&) = delete; diff --git a/aten/src/ATen/native/cuda/CuFFTUtils.h b/aten/src/ATen/native/cuda/CuFFTUtils.h index 38013137f0a40..42e5e9fee11a0 100644 --- a/aten/src/ATen/native/cuda/CuFFTUtils.h +++ b/aten/src/ATen/native/cuda/CuFFTUtils.h @@ -38,12 +38,22 @@ static inline std::string _cudaGetErrorEnum(cufftResult error) return "CUFFT_INVALID_SIZE"; case CUFFT_UNALIGNED_DATA: return "CUFFT_UNALIGNED_DATA"; +<<<<<<< HEAD case CUFFT_INVALID_DEVICE: return "CUFFT_INVALID_DEVICE"; +======= + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case CUFFT_NO_WORKSPACE: return "CUFFT_NO_WORKSPACE"; case CUFFT_NOT_IMPLEMENTED: return "CUFFT_NOT_IMPLEMENTED"; +<<<<<<< HEAD #if CUDA_VERSION <= 12090 case CUFFT_INCOMPLETE_PARAMETER_LIST: return "CUFFT_INCOMPLETE_PARAMETER_LIST"; @@ -51,6 +61,9 @@ static inline std::string _cudaGetErrorEnum(cufftResult error) return "CUFFT_PARSE_ERROR"; #endif #if !defined(USE_ROCM) && CUDA_VERSION <= 12090 +======= +#if !defined(USE_ROCM) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case CUFFT_LICENSE_ERROR: return "CUFFT_LICENSE_ERROR"; #endif diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index 344906a2a4df2..a51d87cf73a55 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -38,6 +38,7 @@ __device__ inline int min(int a, int b) { #define BLOCK_STRIDE_BWD 2 // increasing block_stride to lower # of blocks launched #endif +<<<<<<< HEAD template static __device__ inline index_t p_start(index_t size, int pad, int kernel, int dilation, int stride) { const auto kernel_extent = static_cast((kernel - 1) * dilation + 1); @@ -73,6 +74,14 @@ static inline bool can_use_int32_nhwc( if (height * width > int_max) return false; return true; +======= +static __device__ inline int p_start(int size, int pad, int kernel, int dilation, int stride) { + return (size + pad < ((kernel - 1) * dilation + 1)) ? 
0 : (size + pad - ((kernel - 1) * dilation + 1)) / stride + 1; +} + +static __device__ inline int p_end(int size, int pad, int pooled_size, int stride) { + return min((size + pad) / stride + 1, pooled_size); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // kernels borrowed from Caffe @@ -114,6 +123,7 @@ __global__ void max_pool_forward_nchw(const int nthreads, const scalar_t* bottom } } +<<<<<<< HEAD template C10_LAUNCH_BOUNDS_1(CUDA_MAX_THREADS) __global__ void max_pool_forward_nhwc( @@ -133,6 +143,23 @@ __global__ void max_pool_forward_nhwc( index_t *out_mask_cached = reinterpret_cast(smem_raw); scalar_t *out_cached = reinterpret_cast( out_mask_cached + kernel_size_C*blockDim.x*blockDim.y*blockDim.z); +======= +template +C10_LAUNCH_BOUNDS_1(CUDA_MAX_THREADS) +__global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nbatch, + const int64_t channels, const int64_t height, + const int64_t width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, + const int in_stride_n, const int in_stride_c, + const int in_stride_h, const int in_stride_w, + const int kernel_stride_C, const int kernel_size_C, + scalar_t* top_data, int64_t* top_mask) { + extern __shared__ int smem[]; + int *out_mask_cached = smem; + scalar_t *out_cached = reinterpret_cast(&out_mask_cached[kernel_size_C*blockDim.x*blockDim.y*blockDim.z]); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // flattening cta for pre-computation & smem initialization; int thread_id = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z); @@ -151,6 +178,7 @@ __global__ void max_pool_forward_nhwc( int channel_id = blockIdx.x / nbatch; int channel_offset = threadIdx.x + channel_id * blockDim.x; +<<<<<<< HEAD top_data = top_data + static_cast(batch_id) * (pooled_height * pooled_width * channels); top_mask = top_mask + static_cast(batch_id) * (pooled_height * pooled_width * channels); bottom_data = bottom_data + static_cast(batch_id) * in_stride_n; @@ -171,6 +199,28 @@ __global__ void max_pool_forward_nhwc( for (int ow = ostartW; ow < oendW; ow+=blockDim.y) { index_t wstart = static_cast(ow) * stride_w - pad_w; index_t wend = std::min(wstart + static_cast((kernel_w - 1) * dilation_w + 1), width); +======= + top_data = top_data + batch_id * pooled_height * pooled_width * channels; + top_mask = top_mask + batch_id * pooled_height * pooled_width * channels; + bottom_data = bottom_data + batch_id * in_stride_n; + + out_cached = &out_cached[(threadIdx.z * blockDim.y + threadIdx.y) * kernel_size_C*blockDim.x]; + out_mask_cached = &out_mask_cached[(threadIdx.z * blockDim.y + threadIdx.y) * kernel_size_C*blockDim.x]; + + int oH = (pooled_height + gridDim.z-1) / gridDim.z; + int oW = (pooled_width + gridDim.y-1) / gridDim.y; + int ostartH = threadIdx.z + blockIdx.z*oH; + int oendH = ::min(ostartH+oH, pooled_height); + int ostartW = threadIdx.y + blockIdx.y*oW; + int oendW = ::min(ostartW+oW, pooled_width); + + for (int oh = ostartH; oh < oendH; oh+=blockDim.z) { + int hstart = oh * stride_h - pad_h; + int hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height); + for (int ow = ostartW; ow < oendW; ow+=blockDim.y) { + int wstart = ow * stride_w - pad_w; + int wend = min(wstart + (kernel_w - 1) * dilation_w 
+ 1, width); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) while(hstart < 0) hstart += dilation_h; while(wstart < 0) @@ -218,12 +268,21 @@ __global__ void max_pool_forward_nhwc( // Else do it Non-Prefetch... else #endif +<<<<<<< HEAD for (index_t ih = hstart; ih < hend; ih += dilation_h) { for (index_t iw = wstart; iw < wend; iw += dilation_w) { int cached_index = threadIdx.x; const scalar_t *ptr_input = bottom_data + ih * in_stride_h + iw * in_stride_w; for (index_t c = channel_offset; c < channels; c += static_cast(blockDim.x) * kernel_stride_C) { scalar_t val = ptr_input[c * in_stride_c]; +======= + for (int ih = hstart; ih < hend; ih += dilation_h) { + for (int iw = wstart; iw < wend; iw += dilation_w) { + int cached_index = threadIdx.x; + const scalar_t *ptr_input = bottom_data + ih * in_stride_h + iw * in_stride_w; + for(int c = channel_offset; c < channels; c+= blockDim.x*kernel_stride_C) { + scalar_t val = ptr_input[c*in_stride_c]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if ((val > out_cached[cached_index]) || at::_isnan(val)) { out_cached[cached_index] = val; out_mask_cached[cached_index] = ih * width + iw; @@ -233,6 +292,7 @@ __global__ void max_pool_forward_nhwc( } } +<<<<<<< HEAD scalar_t *ptr_output_data = top_data + (static_cast(oh) * pooled_width + ow) * channels; int64_t *ptr_output_mask = top_mask + (static_cast(oh) * pooled_width + ow) * channels; @@ -242,6 +302,17 @@ __global__ void max_pool_forward_nhwc( ptr_output_mask[c] = static_cast(out_mask_cached[cached_index]); out_cached[cached_index] = at::numeric_limits::lower_bound(); out_mask_cached[cached_index] = index_t(0); +======= + scalar_t *ptr_output_data = top_data + (oh * pooled_width + ow) * channels; + int64_t *ptr_output_mask = top_mask + (oh * pooled_width + ow) * channels; + + int cached_index = threadIdx.x; + for(int c = channel_offset; c < channels; c+= blockDim.x*kernel_stride_C) { + ptr_output_data[c] = out_cached[cached_index]; + ptr_output_mask[c] = out_mask_cached[cached_index]; + out_cached[cached_index] = at::numeric_limits::lower_bound(); + out_mask_cached[cached_index] = 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cached_index += blockDim.x; } } @@ -249,7 +320,11 @@ __global__ void max_pool_forward_nhwc( } +<<<<<<< HEAD static constexpr int BLOCK_THREADS = 256; +======= +static const int BLOCK_THREADS = 256; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template #if defined (USE_ROCM) @@ -495,11 +570,14 @@ const Tensor& indices) { maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); const dim3 block(block_x, block_y, block_z); +<<<<<<< HEAD bool use_int32 = can_use_int32_nhwc( nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, in_stride_n, in_stride_c, in_stride_h, in_stride_w); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int kernel_stride_C = ceil_div( safe_downcast(nInputPlane), block_x * 4); int kernel_size_C = ceil_div( @@ -514,6 +592,7 @@ const Tensor& indices) { ceil_div(safe_downcast(outputHeight), block_z*BLOCK_STRIDE_FWD)); const dim3 grid(grid_x, grid_y, grid_z); +<<<<<<< HEAD size_t shmem_size; size_t mask_elems = 
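In the ROCm-side kernel body above, the dynamic shared memory is laid out as an index cache followed by a value cache, and each (threadIdx.y, threadIdx.z) pair owns a contiguous run of kernel_size_C * blockDim.x entries in both. A hedged sketch of that slice arithmetic, written as a standalone device helper rather than the in-place pointer rebinding the kernel does:

    // Offset (in elements) of the cache slice owned by one (tid_y, tid_z) thread pair.
    __device__ inline int cache_slice_offset(int tid_y, int tid_z,
                                             int block_dim_x, int block_dim_y,
                                             int kernel_size_C) {
        return (tid_z * block_dim_y + tid_y) * kernel_size_C * block_dim_x;
    }

The HEAD side keeps the same layout but types the index cache as index_t rather than int, so its offsets differ only in element size.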
static_cast(kernel_size_C) * block_x * block_y * block_z; @@ -549,6 +628,20 @@ const Tensor& indices) { kernel_stride_C, kernel_size_C, output_data, indices_data); } +======= + size_t shmem_size = (kernel_size_C * block_x*block_y*block_z) * (sizeof(int) + sizeof(scalar_t)); + AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock); + + max_pool_forward_nhwc + <<>>( + input_data, nbatch, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + in_stride_n, in_stride_c, + in_stride_h, in_stride_w, + kernel_stride_C, kernel_size_C, + output_data, indices_data); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } diff --git a/aten/src/ATen/native/cuda/DistributionTemplates.h b/aten/src/ATen/native/cuda/DistributionTemplates.h index 9dcfea3af9435..0bf5bbd42cf68 100644 --- a/aten/src/ATen/native/cuda/DistributionTemplates.h +++ b/aten/src/ATen/native/cuda/DistributionTemplates.h @@ -494,7 +494,11 @@ void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen) auto value = static_cast(rand * range + from); // reverse the bounds of curand4 from (0, 1] to [0, 1) // Note that this method is from legacy THCTensorRandom and is likely to give +<<<<<<< HEAD // you more 0-s, since, the probability of getting 1-s is higher than 0-s and +======= + // you more 0-s, since, the probability of gettings 1-s is higher than 0-s and +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // by reversing the bounds, we are flipping the probabilities of 1-s and 0-s. // BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706 auto reverse_bound_value = value == to ? 
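The ROCm side of the launch code above sizes the kernel's dynamic shared memory as one int plus one scalar_t per cached slot and AT_ASSERTs that it fits the device's per-block limit; the HEAD side starts from the same slot count (mask_elems) before its own sizing and index-type dispatch. A hedged sketch of that guard using the plain CUDA runtime in place of the ATen device-property helper (function names here are illustrative):

    #include <cuda_runtime.h>

    // Bytes of dynamic shared memory: one index slot plus one scalar slot per cached element.
    size_t nhwc_smem_bytes(int kernel_size_C, dim3 block,
                           size_t index_bytes, size_t scalar_bytes) {
        size_t slots = static_cast<size_t>(kernel_size_C) * block.x * block.y * block.z;
        return slots * (index_bytes + scalar_bytes);
    }

    bool smem_fits(size_t smem_bytes, int device_id = 0) {
        cudaDeviceProp prop{};
        cudaGetDeviceProperties(&prop, device_id);
        return smem_bytes <= prop.sharedMemPerBlock;
    }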
from : value; diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index 65b0e1441de78..5fb661a0cf38c 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -15,7 +15,13 @@ #include #include +<<<<<<< HEAD #include +======= +#if CUB_SUPPORTS_SCAN_BY_KEY() +#include +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include @@ -34,9 +40,15 @@ namespace at::native { namespace { #if defined(USE_ROCM) +<<<<<<< HEAD static constexpr int BLOCKDIMY = 16; #else static constexpr int BLOCKDIMY = 32; +======= +static const int BLOCKDIMY = 16; +#else +static const int BLOCKDIMY = 32; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif template @@ -238,6 +250,13 @@ __global__ void renorm_kernel( } // anonymous namespace +<<<<<<< HEAD +======= +#if !CUB_SUPPORTS_SCAN_BY_KEY() +template +void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count); +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indices_, int64_t num_weights, int64_t padding_idx, @@ -300,6 +319,10 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice if (scale_grad_by_freq) { count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT); +<<<<<<< HEAD +======= +#if CUB_SUPPORTS_SCAN_BY_KEY() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () { cudaStream_t stream = at::cuda::getCurrentCUDAStream(); @@ -310,7 +333,11 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice auto count_data = count.mutable_data_ptr(); cuda::cub::inclusive_sum_by_key( sorted_data, +<<<<<<< HEAD ATEN_CUB_CONSTANT_ITERATOR(index_t)(1), +======= + NO_ROCM(at_cuda_detail)ROCM_HIPCUB(::cub)::ConstantInputIterator(1), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) count_data, num_indices ); @@ -322,10 +349,22 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice thrust::make_reverse_iterator(sorted_data + num_indices), thrust::make_reverse_iterator(static_cast(count_data) + num_indices), thrust::make_reverse_iterator(count_data + num_indices), +<<<<<<< HEAD ATEN_CUB_MAXIMUM(), num_indices ); }); +======= + NO_ROCM(at_cuda_detail)ROCM_HIPCUB(::cub)::Max(), + num_indices + ); + }); +#else + AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () { + embedding_dense_backward_cuda_scan(sorted_indices, count); + }); +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return embedding_backward_cuda_kernel(grad, orig_indices, @@ -357,7 +396,11 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices, int warp_size = at::cuda::warp_size(); TORCH_INTERNAL_ASSERT(num_threads() % warp_size == 0 && +<<<<<<< HEAD num_threads() <= static_cast(cuda_utils::kCUDABlockReduceMaxThreads()), +======= + num_threads() <= cuda_utils::kCUDABlockReduceMaxThreads(), +>>>>>>> 
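For scale_grad_by_freq, the Embedding.cu hunk above counts how many times each index occurs: an inclusive sum-by-key over a constant stream of ones, keyed by the sorted indices, gives a running count within each duplicate run, and a second scan over the reversed range with a maximum operator broadcasts each run's total back to every position in the run; the conflict is only about which cub spelling to use, plus a fallback when scan-by-key is unavailable. A hedged thrust-only analogue of the two passes (an assumption-level sketch, not ATen's cub wrapper or its fallback):

    #include <thrust/device_vector.h>
    #include <thrust/functional.h>
    #include <thrust/iterator/constant_iterator.h>
    #include <thrust/scan.h>

    void count_sorted_indices(const thrust::device_vector<int64_t>& sorted,
                              thrust::device_vector<int64_t>& count) {  // count presized to sorted.size()
        // Pass 1: running count within each duplicate run (1, 2, 3, ...).
        thrust::inclusive_scan_by_key(sorted.begin(), sorted.end(),
                                      thrust::make_constant_iterator<int64_t>(1),
                                      count.begin());
        // Pass 2: scan the reversed range with max so every position sees the run total.
        thrust::inclusive_scan_by_key(sorted.rbegin(), sorted.rend(),
                                      count.rbegin(), count.rbegin(),
                                      thrust::equal_to<int64_t>(),
                                      thrust::maximum<int64_t>());
    }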
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "BlockReduceSum requires all warps be active"); const int64_t *num_unique_indices_ptr = num_unique_indices.const_data_ptr(); dim3 grid = unique_indices.numel(); diff --git a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu index 6ce419137345f..ee8253de43456 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu @@ -10,7 +10,13 @@ #include +<<<<<<< HEAD #include +======= +#if CUB_SUPPORTS_UNIQUE_BY_KEY() +#include +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include @@ -86,9 +92,15 @@ __global__ void compute_grad_weight_bags( const int64_t stride_warped) { int64_t num_of_segments = *num_of_segments_ptr; +<<<<<<< HEAD const int64_t gid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; const int64_t id = gid / stride_warped; const int64_t startFeature = gid % stride_warped; +======= + const int gid = blockIdx.x * blockDim.x + threadIdx.x; + const int id = gid / stride_warped; + const int startFeature = gid % stride_warped; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (startFeature >= stride) { return; } @@ -132,9 +144,15 @@ __global__ void compute_grad_weight( int64_t num_of_segments = *num_of_segments_ptr; using accscalar_t = acc_type; +<<<<<<< HEAD const int64_t gid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; const int64_t id = gid / stride_warped; const int64_t startFeature = gid % stride_warped; +======= + const int gid = blockIdx.x * blockDim.x + threadIdx.x; + const int id = gid / stride_warped; + const int startFeature = gid % stride_warped; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (startFeature >= stride) { return; } @@ -165,9 +183,15 @@ __global__ void sum_and_scatter( int64_t num_of_segments = *num_of_segments_ptr; int64_t num_of_partial_segments = *num_of_partial_segments_ptr; +<<<<<<< HEAD const int64_t gid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; const int64_t id = gid / stride_warped; const int64_t startFeature = gid % stride_warped; +======= + const int gid = blockIdx.x * blockDim.x + threadIdx.x; + const int id = gid / stride_warped; + const int startFeature = gid % stride_warped; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (startFeature >= stride) { return; } @@ -194,9 +218,24 @@ __global__ void compute_num_of_partial_segments(const index_t *partials_per_segm partials_per_segment_offset[num_of_segments-1]; } +<<<<<<< HEAD + +} // anon namespace + +======= +#if !CUB_SUPPORTS_UNIQUE_BY_KEY() +__global__ void write_num_of_segments_for_legacy_thrust_path(int64_t *num_of_segments_ptr, int64_t num_of_segments) { + *num_of_segments_ptr = num_of_segments; +} +#endif } // anon namespace +#if !CUB_SUPPORTS_UNIQUE_BY_KEY() +template +int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets); +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor embedding_backward_cuda_kernel( const Tensor &grad, @@ -223,12 
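The HEAD side of the EmbeddingBackwardKernel.cu hunks above widens the flattened thread id, and the values derived from it, to int64_t: with the previous int arithmetic, blockIdx.x * blockDim.x can wrap once a launch covers more than INT_MAX threads, which a large stride_warped times many segments can reach. A minimal sketch of the widened form (helper name chosen here for illustration):

    #include <cstdint>

    __device__ inline int64_t flat_thread_id() {
        // Promote before multiplying so the whole expression is evaluated in 64 bits.
        return static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    }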
+262,26 @@ Tensor embedding_backward_cuda_kernel( auto segment_offsets = at::empty({numel}, orig_indices.options()); auto num_of_segments_tensor = at::empty({}, grad.options().dtype(kLong)); int64_t *num_of_segments_ptr = num_of_segments_tensor.mutable_data_ptr(); +<<<<<<< HEAD +======= +#if !CUB_SUPPORTS_UNIQUE_BY_KEY() + AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () { + int64_t num_of_segments = embedding_backward_cuda_kernel_unique_by_key(sorted_indices, segment_offsets); + write_num_of_segments_for_legacy_thrust_path<<<1, 1, 0, c10::cuda::getCurrentCUDAStream()>>>(num_of_segments_ptr, num_of_segments); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); +#else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () { cuda::cub::unique_by_key( sorted_indices.const_data_ptr(), thrust::make_counting_iterator(0), segment_offsets.mutable_data_ptr(), num_of_segments_ptr, sorted_indices.numel()); }); +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t max_segments = std::min(numel, num_weights); diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index ab3747df031eb..86c021c8c1435 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -31,10 +31,23 @@ #include +<<<<<<< HEAD #include namespace at::native { +======= +#if CUB_SUPPORTS_SCAN_BY_KEY() +#include +#endif + +namespace at::native { + +#if !CUB_SUPPORTS_SCAN_BY_KEY() +template +void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count); +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace { @@ -193,6 +206,10 @@ Tensor embedding_bag_backward_cuda_sum_avg( if (scale_grad_by_freq) { count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT); +<<<<<<< HEAD +======= +#if CUB_SUPPORTS_SCAN_BY_KEY() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () { cudaStream_t stream = at::cuda::getCurrentCUDAStream(); @@ -203,7 +220,11 @@ Tensor embedding_bag_backward_cuda_sum_avg( auto count_data = count.mutable_data_ptr(); cuda::cub::inclusive_sum_by_key( sorted_data, +<<<<<<< HEAD ATEN_CUB_CONSTANT_ITERATOR(index_t)(1), +======= + NO_ROCM(at_cuda_detail)ROCM_HIPCUB(::cub)::ConstantInputIterator(1), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) count_data, num_indices ); @@ -215,10 +236,22 @@ Tensor embedding_bag_backward_cuda_sum_avg( thrust::make_reverse_iterator(sorted_data + num_indices), thrust::make_reverse_iterator(count_data + num_indices), thrust::make_reverse_iterator(count_data + num_indices), +<<<<<<< HEAD ATEN_CUB_MAXIMUM(), num_indices ); }); +======= + NO_ROCM(at_cuda_detail)ROCM_HIPCUB(::cub)::Max(), + num_indices + ); + }); +#else + AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () { + embedding_dense_backward_cuda_scan(sorted_indices, count); + }); +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast 
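The unique_by_key call above turns the sorted indices into segment offsets: with a counting iterator as the values, keeping one value per distinct key records the position where each embedding row's run of gradients begins, and the number of kept keys is the number of segments; the conflicted #if merely routes toolchains without cub unique_by_key support to a separately compiled legacy helper. A hedged thrust analogue of the same step (not the ATen cuda::cub wrapper used here):

    #include <thrust/device_vector.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/unique.h>

    int64_t segment_offsets_from_sorted(const thrust::device_vector<int64_t>& sorted,
                                        thrust::device_vector<int64_t>& unique_keys,
                                        thrust::device_vector<int64_t>& offsets) {
        // Both output vectors are assumed presized to sorted.size().
        // Keep the first position of every distinct sorted index.
        auto ends = thrust::unique_by_key_copy(sorted.begin(), sorted.end(),
                                               thrust::make_counting_iterator<int64_t>(0),
                                               unique_keys.begin(), offsets.begin());
        return ends.first - unique_keys.begin();  // number of segments
    }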
kernel for mixed dtypes with float/bfloat16/half (#2791)) } return embedding_backward_cuda_kernel(grad, orig_indices, sorted_indices, count, num_weights, padding_idx, mode == EmbeddingBagMode::MEAN, offset2bag, diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu index 227d42247ebd9..5b22b01893130 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu @@ -51,7 +51,11 @@ std::vector foreach_tensor_list_op( Op(), alpha.to()); +<<<<<<< HEAD return std::move(tensor_lists[2]); +======= + return tensor_lists[2]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template class Op> diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu index 9ac0e875b2d68..cadc1490970ba 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu @@ -45,7 +45,11 @@ std::vector foreach_binary_op( /* res_arg_index */ 1>(), Op(), scalar.to()); +<<<<<<< HEAD return std::move(tensor_lists[1]); +======= + return tensor_lists[1]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template class Op> diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu index b28aa690630b4..2fe7112dedfae 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu @@ -33,7 +33,11 @@ std::vector foreach_binary_op( } tensor_lists.emplace_back(tensors.vec()); +<<<<<<< HEAD tensor_lists.emplace_back(std::move(vec_res)); +======= + tensor_lists.emplace_back(vec_res); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using opmath_t = at::opmath_type; multi_tensor_apply<2, opmath_t>( @@ -46,7 +50,11 @@ std::vector foreach_binary_op( /* res_arg_index */ 1>(), Op()); +<<<<<<< HEAD return std::move(tensor_lists[1]); +======= + return tensor_lists[1]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template class Op> diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu index bc6bd37891258..62696916cccda 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu @@ -56,7 +56,11 @@ std::vector foreach_binary_op( Op(), scalar.data_ptr(), alpha.to()); +<<<<<<< HEAD return std::move(tensor_lists[1]); +======= + return tensor_lists[1]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template class Op> diff --git a/aten/src/ATen/native/cuda/ForeachFunctors.cuh b/aten/src/ATen/native/cuda/ForeachFunctors.cuh index c121d971cd7be..a6938aeb94d24 100644 --- a/aten/src/ATen/native/cuda/ForeachFunctors.cuh +++ b/aten/src/ATen/native/cuda/ForeachFunctors.cuh @@ -208,7 +208,11 @@ struct BinaryOpScalarFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
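The ForeachBinaryOp* conflicts above all come down to one C++ detail: tensor_lists is a local vector of tensor lists, and returning one of its elements by subscript yields an lvalue, so a plain return copies the inner vector while return std::move(...) moves it out (NRVO only applies when the whole local object is returned by name). A small standalone illustration with ordinary std::vector standing in for the tensor types:

    #include <utility>
    #include <vector>

    std::vector<int> take_result() {
        std::vector<std::vector<int>> lists(3);
        lists[2] = {1, 2, 3};
        return std::move(lists[2]);  // moves the element; `return lists[2];` would copy it
    }

The same reasoning applies to tensor_lists.emplace_back(std::move(vec_res)) versus emplace_back(vec_res) in the ScalarList variant above.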
float/bfloat16/half (#2791)) TensorListMetadata& tl, Op op, opmath_t scalar) { @@ -232,7 +236,11 @@ struct BinaryOpScalarListFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListScalarListMetadata& tl, Op op) { const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; @@ -256,7 +264,11 @@ struct BinaryOpListAlphaFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListMetadata& tl, Op op, opmath_t alpha) { @@ -308,7 +320,11 @@ struct BinaryOpScalarTensorFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListMetadata& tl, Op op, T* scalar, @@ -364,7 +380,11 @@ struct BinaryOpScalarTensorFunctor { template struct ZeroFunctor { __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListMetadata<1>& tl) { const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; @@ -406,7 +426,11 @@ struct UnaryOpFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListMetadata& tl, Op op) { const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; @@ -458,7 +482,11 @@ struct PointwiseOpScalarFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListMetadata& tl, Op op, opmath_t scalar) { @@ -482,7 +510,11 @@ struct PointwiseOpScalarListFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListScalarListMetadata& tl, Op op) { const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; @@ -506,7 +538,11 @@ struct PointwiseOpListFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListMetadata& tl, Op op) { const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; @@ -557,7 +593,11 @@ struct TernaryOpListFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int 
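The ForeachFunctors.cuh conflicts above all change the functors' chunk_size parameter from int to int64_t. The likely motivation is the offset arithmetic inside multi_tensor_apply-style functors, where a chunk's base offset is on the order of chunk_idx * chunk_size; with two 32-bit operands that product is computed in int and can wrap for very large tensors. A hedged, hypothetical helper showing the widened form (not the actual functor code):

    #include <cstdint>

    __device__ inline int64_t chunk_base_offset(int chunk_idx, int64_t chunk_size) {
        // chunk_idx is promoted to int64_t, so the multiply cannot wrap.
        return chunk_idx * chunk_size;
    }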
chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListMetadata& tl, Op op) { static_assert(depth == 3 || depth == 4, ""); @@ -611,7 +651,11 @@ struct TernaryOpScalarFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListMetadata& tl, Op op, opmath_t alpha) { @@ -668,7 +712,11 @@ struct TernaryOpScalarListFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListScalarListMetadata& tl, Op op) { static_assert(depth == 2 || depth == 3, ""); diff --git a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu index 7f563f55d5565..4bd8e26ff7b50 100644 --- a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu +++ b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu @@ -57,7 +57,11 @@ std::vector foreach_pointwise_op( scalar.to()); }); +<<<<<<< HEAD return std::move(tensor_lists[3]); +======= + return tensor_lists[3]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template